diff --git a/.bazelrc b/.bazelrc index 94841167276..391fc927c27 100644 --- a/.bazelrc +++ b/.bazelrc @@ -184,6 +184,10 @@ build:android_x86_64 --config=android build:android_x86_64 --cpu=x86_64 build:android_x86_64 --fat_apk_cpu=x86_64 +# Build everything statically for Android since all static libs are later +# bundled together into a single .so for deployment. +build:android --dynamic_mode=off + # Sets the default Apple platform to macOS. build:macos --apple_platform_type=macos @@ -202,6 +206,8 @@ build:ios_armv7 --config=ios build:ios_armv7 --cpu=ios_armv7 build:ios_arm64 --config=ios build:ios_arm64 --cpu=ios_arm64 +build:ios_arm64e --config=ios +build:ios_arm64e --cpu=ios_arm64e build:ios_sim_arm64 --config=ios build:ios_sim_arm64 --cpu=ios_sim_arm64 build:ios_i386 --config=ios @@ -219,7 +225,9 @@ build:monolithic --define framework_shared_object=false build:monolithic --define tsl_protobuf_header_only=false build:monolithic --experimental_link_static_libraries_once=false # b/229868128 -# Please note that MKL on MacOS or windows is still not supported. +build:linux --define=build_with_onednn_v2=true + +# Please note that MKL on MacOS is still not supported. # If you would like to use a local MKL instead of downloading, please set the # environment variable "TF_MKL_ROOT" every time before build. build:mkl --define=build_with_mkl=true --define=enable_mkl=true @@ -551,8 +559,8 @@ build:rbe_linux_py3_base --python_path="/usr/local/bin/python3.9" build:rbe_linux_py3_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_python3.9" build:rbe_win --config=rbe -build:rbe_win --crosstool_top="//tensorflow/tools/toolchains/win/tf_win_02232023:toolchain" -build:rbe_win --extra_toolchains="//tensorflow/tools/toolchains/win/tf_win_02232023:cc-toolchain-x64_windows" +build:rbe_win --crosstool_top="//tensorflow/tools/toolchains/win/tf_win_05022023:toolchain" +build:rbe_win --extra_toolchains="//tensorflow/tools/toolchains/win/tf_win_05022023:cc-toolchain-x64_windows" build:rbe_win --extra_execution_platforms="//tensorflow/tools/toolchains/win:rbe_windows_ltsc2019" build:rbe_win --host_platform="//tensorflow/tools/toolchains/win:rbe_windows_ltsc2019" build:rbe_win --platforms="//tensorflow/tools/toolchains/win:rbe_windows_ltsc2019" @@ -683,10 +691,10 @@ build:ubsan --linkopt -fsanitize=undefined build:ubsan --linkopt -lubsan # Disable TFRT integration for now unless --config=tfrt is specified. 
-build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils +build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug # TODO(b/240450920): We are in the process of migrating JitRt backend to XLA # and while we are doing this we can't keep it buildable/testable in OSS. 
-build:tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils +build:tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug # TF Fuzztest config try-import fuzztest.bazelrc diff --git a/.github/ISSUE_TEMPLATE/tensorflow_issue_template.yaml b/.github/ISSUE_TEMPLATE/tensorflow_issue_template.yaml index 91c37cfe117..ac5643d9276 100644 --- a/.github/ISSUE_TEMPLATE/tensorflow_issue_template.yaml +++ b/.github/ISSUE_TEMPLATE/tensorflow_issue_template.yaml @@ -131,7 +131,6 @@ body: description: Also tell us, what did you expect to happen? placeholder: Tell us what you see! value: "A bug happened!" 
- render: shell validations: required: true - type: textarea diff --git a/.github/bot_config.yml b/.github/bot_config.yml index 45b4c58fc90..b5cf2a5a6c2 100644 --- a/.github/bot_config.yml +++ b/.github/bot_config.yml @@ -16,9 +16,8 @@ # A list of assignees assignees: - synandi - - tiruk007 + - SuryanarayanaY - tilakrayal - - pjpratik # A list of assignees for compiler folder compiler_assignees: - joker-eph diff --git a/.github/workflows/arm-ci-extended-cpp.yml b/.github/workflows/arm-ci-extended-cpp.yml new file mode 100644 index 00000000000..cfa3a214918 --- /dev/null +++ b/.github/workflows/arm-ci-extended-cpp.yml @@ -0,0 +1,61 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +name: ARM CI Extended C++ + +on: + push: + tags: + - v2.** + schedule: + - cron: '0 2 * * *' + +jobs: + build: + if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks + runs-on: [self-hosted, linux, ARM64] + strategy: + matrix: + pyver: ['3.10'] + steps: + - name: Stop old running containers (if any) + shell: bash + run: | + running_containers=$(docker ps -q) && \ + if [[ $running_containers == "" ]]; then + echo "No running containers"; + else + echo "Running container(s) found" && \ + docker stop $running_containers; + fi + docker container prune -f + docker image prune -af + - name: Clean repository + shell: bash + run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true + - name: Checkout repository for nightly (skipped for releases) + if: ${{ github.event_name == 'schedule' }} + uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + with: + ref: 'nightly' + - name: Checkout repository + if: ${{ github.event_name == 'push' }} + uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + - name: Build binary and run C++ tests + shell: bash + run: | + is_nightly=0 && tf_project_name='tf_ci_ext_c' && ${{ github.event_name == 'schedule' }} && is_nightly=1 && tf_project_name='tf_nightly_ci_ext_c' + CI_DOCKER_BUILD_EXTRA_PARAMS="--build-arg py_major_minor_version=${{ matrix.pyver }} --build-arg is_nightly=${is_nightly} --build-arg tf_project_name=${tf_project_name}" \ + ./tensorflow/tools/ci_build/ci_build.sh cpu.arm64 bash tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_cpp.sh diff --git a/.github/workflows/arm-ci-extended.yml b/.github/workflows/arm-ci-extended.yml index 1592f4ed18a..7e32dafabe9 100644 --- a/.github/workflows/arm-ci-extended.yml +++ b/.github/workflows/arm-ci-extended.yml @@ -17,14 +17,10 @@ name: ARM CI Extended on: push: - branches: - - master - - r2.** - pull_request: - types: [opened, synchronize, reopened] - branches: - - master - - r2.** + tags: + - v2.** + schedule: + - cron: '0 4 * * *' jobs: build: @@ -49,10 +45,17 @@ jobs: - name: Clean repository shell: bash run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. 
-name . -o -prune -exec sudo rm -rf -- {} + || true + - name: Checkout repository for nightly (skipped for releases) + if: ${{ github.event_name == 'schedule' }} + uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + with: + ref: 'nightly' - name: Checkout repository + if: ${{ github.event_name == 'push' }} uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 - name: Build binary and run non-pip tests shell: bash run: | - CI_DOCKER_BUILD_EXTRA_PARAMS='--build-arg py_major_minor_version=${{ matrix.pyver }}' \ + is_nightly=0 && tf_project_name='tf_ci_ext' && ${{ github.event_name == 'schedule' }} && is_nightly=1 && tf_project_name='tf_nightly_ci_ext' + CI_DOCKER_BUILD_EXTRA_PARAMS="--build-arg py_major_minor_version=${{ matrix.pyver }} --build-arg is_nightly=${is_nightly} --build-arg tf_project_name=${tf_project_name}" \ ./tensorflow/tools/ci_build/ci_build.sh cpu.arm64 bash tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_nonpip.sh diff --git a/.github/workflows/arm-ci.yml b/.github/workflows/arm-ci.yml index e6ddbb9eec9..b0876ba60d7 100644 --- a/.github/workflows/arm-ci.yml +++ b/.github/workflows/arm-ci.yml @@ -54,7 +54,7 @@ jobs: - name: Build and test pip wheel shell: bash run: | - CI_DOCKER_BUILD_EXTRA_PARAMS='--build-arg py_major_minor_version=${{ matrix.pyver }}' \ + CI_DOCKER_BUILD_EXTRA_PARAMS="--build-arg py_major_minor_version=${{ matrix.pyver }} --build-arg is_nightly=1 --build-arg tf_project_name=tf_nightly_ci" \ ./tensorflow/tools/ci_build/ci_build.sh cpu.arm64 bash tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_pip.sh - name: Upload pip wheel to GitHub uses: actions/upload-artifact@83fd05a356d7e2593de66fc9913b3002723633cb # v3.1.1 diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml index d4fd32171b4..b4579591c91 100644 --- a/.github/workflows/stale-issues.yml +++ b/.github/workflows/stale-issues.yml @@ -28,8 +28,16 @@ jobs: pull-requests: write steps: - name: Awaiting response issues - uses: actions/stale@v5 + uses: actions/stale@v7 with: + #Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale + exempt-issue-labels: 'override-stale' + #Comma separated list of labels that can be assigned to PRs to exclude them from being marked as stale + exempt-pr-labels: "override-stale" + #Limit the No. of API calls in one run default value is 30. + operations-per-run: 1000 + #Prevent to remove stale label when PRs or issues are updated. + remove-stale-when-updated: false days-before-issue-stale: 7 days-before-issue-close: 7 stale-issue-label: "stale" @@ -48,8 +56,16 @@ jobs: close-pr-message: "This PR was closed because it has been inactive for 14 days since being marked as stale. Please reopen if you'd like to work on this further." repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Contribution issues - uses: actions/stale@v5 + uses: actions/stale@v7 with: + #Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale + exempt-issue-labels: 'override-stale' + #Comma separated list of labels that can be assigned to PRs to exclude them from being marked as stale + exempt-pr-labels: "override-stale" + #Limit the No. of API calls in one run default value is 30. + operations-per-run: 1000 + #Prevent to remove stale label when PRs or issues are updated. 
+ remove-stale-when-updated: false days-before-issue-stale: 180 days-before-issue-close: 365 stale-issue-label: "stale" diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml index ce31d59868a..d32d7affd64 100644 --- a/.github/workflows/update-rbe.yml +++ b/.github/workflows/update-rbe.yml @@ -80,6 +80,18 @@ jobs: map sigbuild-r2.12-clang-python3.9 2.12-python3.9 map sigbuild-r2.12-clang-python3.10 2.12-python3.10 map sigbuild-r2.12-clang-python3.11 2.12-python3.11 + # TF 2.13 + map sigbuild-r2.13 2.13-python3.9 + map sigbuild-r2.13-python3.8 2.13-python3.8 + map sigbuild-r2.13-python3.9 2.13-python3.9 + map sigbuild-r2.13-python3.10 2.13-python3.10 + map sigbuild-r2.13-python3.11 2.13-python3.11 + # TF 2.13 + Clang (containers are the same, but env vars in configs.bzl are different) + map sigbuild-r2.13-clang 2.13-python3.9 + map sigbuild-r2.13-clang-python3.8 2.13-python3.8 + map sigbuild-r2.13-clang-python3.9 2.13-python3.9 + map sigbuild-r2.13-clang-python3.10 2.13-python3.10 + map sigbuild-r2.13-clang-python3.11 2.13-python3.11 - name: Create Pull Request with changes uses: peter-evans/create-pull-request@2b011faafdcbc9ceb11414d64d0573f37c774b04 # v4.2.3 with: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ccc170b5c6e..beea15f9bf0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,39 +19,58 @@ Before sending your pull requests, make sure you do the following: ### Typical Pull Request Workflow - -**1. New PR** - As a contributor, you submit a New PR on GitHub. - We inspect -every incoming PR and add certain labels to the PR such as `size:`, `comp:` etc. -At this stage we check if the PR is valid and meets certain quality -requirements. - For example - We check if the CLA is signed, PR has sufficient -description, if applicable unit tests are added, if it is a reasonable -contribution meaning it is not a single liner cosmetic PR. +**1. New PR** -**2. Valid?** - If the PR passes all the quality checks then we go ahead and -assign a reviewer. - If the PR didn't meet the validation criteria, we request -for additional changes to be made to PR to pass quality checks and send it back -or on a rare occassion we may reject it. +- As a contributor, you submit a New PR on GitHub. +- We inspect every incoming PR and add certain labels to the PR such as `size:`, + `comp:` etc. At this stage we check if the PR is valid and meets certain + quality requirements. For example, we check if the CLA is signed, PR has + sufficient description, if applicable unit tests are added, if it is a + reasonable contribution (meaning it is not a single liner cosmetic PR). -**3. Review** - For Valid PR, reviewer (person familiar with the -code/functionality) checks if the PR looks good or needs additional changes. - -If all looks good, reviewer would approve the PR. - If a change is needed, the -contributor is requested to make suggested change. - You make the change and -submit for the review again. - This cycle repeats itself till the PR gets -approved. - Note: As a friendly reminder we may reach out to you if the PR is -awaiting your response for more than 2 weeks. +**2. Valid?** -**4. Approved** - Once the PR is approved, it gets `kokoro:force-run` label -applied and it initiates CI/CD tests. - We can't move forward if these tests -fail. - In such situations, we may request you to make further changes to your -PR for the tests to pass. - Once the tests pass, we now bring all the code in -the internal code base, using a job called "copybara". 
+- If the PR passes all the quality checks then we go ahead and assign a + reviewer. +- If the PR didn't meet the validation criteria, we request additional + changes to be made to the PR to pass quality checks and send it back or on a rare + occasion we may reject it. -**5. Copy to G3** - Once the PR is in Google codebase, we make sure it -integrates well with its dependencies and the rest of the system. - Rarely, but -If the tests fail at this stage, we cannot merge the code. - If needed, we may -come to you to make some changes. - At times, it may not be you, it may be us -who may have hit a snag. - Please be patient while we work to fix this. - Once -the internal tests pass, we go ahead and merge the code internally as well as -externally on GitHub. +**3. Review** + +- For a valid PR, a reviewer (person familiar with the code/functionality) checks if + the PR looks good or needs additional changes. +- If all looks good, the reviewer will approve the PR. +- If a change is needed, the contributor is requested to make the suggested change. +- You make the change and submit it for review again. +- This cycle repeats itself until the PR gets approved. +- Note: As a friendly reminder, we may reach out to you if the PR is awaiting + your response for more than 2 weeks. + +**4. Approved** + +- Once the PR is approved, it gets the `kokoro:force-run` label applied and it + initiates CI/CD tests. +- We can't move forward if these tests fail. +- In such situations, we may request you to make further changes to your PR for + the tests to pass. +- Once the tests pass, we now bring all the code into the internal code base, + using a job called "copybara". + +**5. Copy to Google Internal codebase and run internal CI** + +- Once the PR is in the Google codebase, we make sure it integrates well with its + dependencies and the rest of the system. +- Rarely, the tests may fail at this stage; if they do, we cannot merge the code. +- If needed, we may come to you to make some changes. At times, it may not be + you, it may be us who have hit a snag. Please be patient while we work to + fix this. +- Once the internal tests pass, we go ahead and merge the code internally as + well as externally on GitHub. + +In graphical form, the entire lifetime of a PR looks like this: + +![image](https://user-images.githubusercontent.com/323199/229561784-0a2f5509-b731-493f-ad88-bad487688c8d.png) ### Contributor License Agreements diff --git a/README.md b/README.md index 2e1f9c72183..fa7a6c45733 100644 --- a/README.md +++ b/README.md @@ -92,8 +92,8 @@ uphold this code.** **We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for tracking requests and bugs, please see -[TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) -for general questions and discussion, and please direct specific questions to +[TensorFlow Forum](https://discuss.tensorflow.org/) for general questions and +discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).** The TensorFlow project strives to abide by generally accepted best practices in diff --git a/RELEASE.md b/RELEASE.md index 15bfd428d6e..87ebf46e557 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,45 @@ +# Release 2.14.0 + + + +# Breaking Changes + +* +* + +* `tf.Tensor` + * The class hierarchy for `tf.Tensor` has changed, and there are now + explicit `EagerTensor` and `SymbolicTensor` classes for eager and + tf.function respectively. Users who relied on the exact type of Tensor + (e.g.
`type(t) == tf.Tensor`) will need to update their code to use + `isinstance(t, tf.Tensor)`. The `tf.is_symbolic_tensor` helper added in + 2.13 may be used when it is necessary to determine if a value is + specifically a symbolic tensor. + +# Known Caveats + +* +* +* + +# Major Features and Improvements + +* +* + +# Bug Fixes and Other Changes +* `tf.lite` + * Strided_Slice now supports `UINT32`. +* +* +* + +# Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +, , , , , + # Release 2.13.0 ## Breaking Changes @@ -18,6 +60,26 @@ modifying H5 files saved by Keras under a `.keras` extension. If this breaks you, simply add `save_format="h5"` to your `.save()` call to revert back to the prior behavior. + * Added `keras.utils.TimedThread` utility to run a timed thread every x + seconds. It can be used to run a threaded function alongside model + training or any other snippet of code. + * In the `keras` PyPI package, accessible symbols are now restricted to + symbols that are intended to be public. + This may affect your code if you were using `import keras` and you used + `keras` functions that were not public APIs, but were accessible in + earlier versions with direct imports. In those cases, please use the + following guideline: + - The API may be available in the public Keras API under a different + name, so make sure to look for it on keras.io or TensorFlow docs + and switch to the public version. + - It could also be a simple python or TF utility that you could easily + copy over to your own codebase. In those case, just make it your own! + - If you believe it should definitely be a public Keras API, + please open a feature request in keras GitHub repo. + - As a workaround, you could import the same private symbol keras + `keras.src`, but keep in mind the `src` namespace is not stable and + those APIs may change or be removed in the future. + * The LMDB kernels have been changed to return an error. This is in preparation for completely removing them from TensorFlow. The LMDB dependency that these @@ -40,11 +102,19 @@ clustering. * Add int16x8 support for the built-in op `exp` * Add int16x8 support for the built-in op `mirror_pad` + * Add int16x8 support for the built-in ops `space_to_batch_nd` and + `batch_to_space_nd` * Add 16-bit int type support for built-in op `less`, `greater_than`, `equal` * Add 8-bit and 16-bit support for `floor_div` and `floor_mod`. + * Add 16-bit and 32-bit int support for the built-in op `bitcast`. + * Add 8-bit/16-bit/32-bit int/uint support for the built-in op `bitwise_xor` * Add int16 indices support for built-in op `gather` and `gather_nd`. + * Add 8-bit/16-bit/32-bit int/uint support for the built-in op `right_shift` * Add reference implementation for 16-bit int unquantized `add`. + * Add reference implementation for 16-bit int and 32-bit unsigned int unquantized `mul`. + * `add_op` supports broadcasting up to 6 dimensions. + * Add 16-bit support for `top_k`. * `tf.keras` @@ -57,6 +127,8 @@ libraries (like sklearn or pycocotools) into Keras as first-class Keras metrics. * Added `tf.keras.optimizers.Lion` optimizer. + * Added `tf.keras.layers.SpectralNormalization` layer wrapper to perform + spectral normalization on the weights of a target layer. * The `SidecarEvaluatorModelExport` callback has been added to Keras as `keras.callbacks.SidecarEvaluatorModelExport`. 
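For readers updating type checks for the 2.14 `tf.Tensor` class-hierarchy change described above, a minimal sketch of the recommended pattern follows; the `describe` helper and the sample shapes are illustrative, and only `isinstance` checks and `tf.is_symbolic_tensor` come from the notes.

```python
import tensorflow as tf

def describe(t):
    # Prefer isinstance over exact type comparison: both eager and symbolic
    # tensors remain subclasses of tf.Tensor after the hierarchy change.
    if not isinstance(t, tf.Tensor):
        return "not a tensor"
    # tf.is_symbolic_tensor (added in 2.13 per the notes) distinguishes
    # graph-time (tf.function) values from eager ones.
    return "symbolic" if tf.is_symbolic_tensor(t) else "eager"

print(describe(tf.constant([1.0, 2.0])))  # eager

@tf.function
def add_one(x):
    print(describe(x))  # printed while tracing: symbolic
    return x + 1.0

add_one(tf.constant([1.0, 2.0]))
```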
This callback allows for exporting the model the best-scoring model as evaluated by a @@ -76,6 +148,16 @@ `tf.keras.__internal__.RaggedKerasTensor` classes. You can use these classes to do instance type checking and type annotations for layer/model inputs and outputs. + * All the `tf.keras.dtensor.experimental.optimizers` classes have been + merged with `tf.keras.optimizers`. You can migrate your code to use + `tf.keras.optimizers` directly. The API namespace for + `tf.keras.dtensor.experimental.optimizers` will be removed in future + releases. + * Added support for `class_weight` for 3+ dimensional targets (e.g. + image segmentation masks) in `Model.fit`. + * Added a new loss, `keras.losses.CategoricalFocalCrossentropy`. + * Remove the `tf.keras.dtensor.experimental.layout_map_scope()`. You can + user the `tf.keras.dtensor.experimental.LayoutMap.scope()` instead. * `tf.function`: @@ -94,6 +176,22 @@ `tf.nn.safe_embedding_lookup_sparse`, which enables a simplified and typically faster lookup procedure. +* `tf.data` + + * `tf.data.Dataset.zip` now supports Python-style zipping, i.e. + `Dataset.zip(a, b, c)`. + * `tf.data.Dataset.shuffle` now supports full shuffling. To specify that + data should be fully shuffled, use + `dataset = dataset.shuffle(dataset.cardinality())`. This will load the + full dataset into memory so that it can be shuffled, so make sure to + only use this with datasets of filenames or other small datasets. + +* `tf.math` + + * `tf.nn.top_k` now supports specifying the output index type via parameter + `index_type`. Supported types are `tf.int16`, `tf.int32` + (default), and `tf.int64`. + * `tf.SavedModel` * Introduce class method @@ -109,6 +207,13 @@ * * +* `tf.Variable` + + * Changed resource variables to inherit from `tf.compat.v2.Variable` + instead of `tf.compat.v1.Variable`. Some checks for + `isinstance(v, tf.compat.v1.Variable)` that previously returned True + may now return False. + * `tf.distribute` * Opened an experimental API, @@ -124,6 +229,20 @@ * List of members of dtensor.Layout and dtensor.Mesh have slightly changed as part of efforts to consolidate the C++ and Python source code with pybind11. Most notably, Layout.serialized_string is removed. + * Minor API changes to represent Single Device Layout for non-distributed + Tensors inside DTensor functions. Runtime support will be added soon. + +* `tf.experimental.ExtensionType`: + + * `tf.experimental.ExtensionType` now supports Python `tuple` as + the type annotation of its fields. + +* `tf.nest`: + * Deprecated API `tf.nest.is_sequence` has now been deleted. + Please use `tf.nest.is_nested` instead. + +* `tf.lite`: + * Add UINT32 support to tfl.pack ## Thanks to our Contributors @@ -134,217 +253,166 @@ This release contains contributions from many people at Google, as well as: # Release 2.12.0 -# Breaking Changes - -* -* +### Breaking Changes * Build, Compilation and Packaging - * Removal of redundant packages: the `tensorflow-gpu` and `tf-nightly-gpu` - packages have been effectively removed and replaced with packages that - direct users to switch to `tensorflow` or `tf-nightly` respectively. - The naming difference was the only difference between the two sets of - packages ever since TensorFlow 2.1, so there is no loss of functionality - or GPU support. See - https://pypi.org/project/tensorflow-gpu for more details. + * Removed redundant packages `tensorflow-gpu` and `tf-nightly-gpu`. 
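The `tf.data` and `tf.math` notes above describe Python-style `Dataset.zip`, full shuffling via `cardinality()`, and an `index_type` parameter for `tf.nn.top_k`. A small sketch of how those calls might look on a TF 2.13-level runtime; the dataset contents and variable names are placeholders, not part of the patch.

```python
import tensorflow as tf

a = tf.data.Dataset.range(5)
b = tf.data.Dataset.range(5).map(lambda x: x * 10)
c = tf.data.Dataset.from_tensor_slices(["v", "w", "x", "y", "z"])

# Python-style zipping per the tf.data note; older releases required
# Dataset.zip((a, b, c)).
zipped = tf.data.Dataset.zip(a, b, c)

# Full shuffle: the buffer spans the whole (small, in-memory) dataset.
shuffled = zipped.shuffle(zipped.cardinality())
for x, y, z in shuffled:
    print(int(x), int(y), z.numpy())

# Narrower output index dtype, assuming the parameter is spelled index_type
# as in the tf.math note.
values, indices = tf.nn.top_k(tf.constant([1.0, 3.0, 2.0]), k=2,
                              index_type=tf.int16)
print(indices.dtype)  # int16
```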
These packages were removed and replaced with packages that direct users to switch to `tensorflow` or `tf-nightly` respectively. Since TensorFlow 2.1, the only difference between these two sets of packages was their names, so there is no loss of functionality or GPU support. See https://pypi.org/project/tensorflow-gpu for more details. * `tf.function`: - * tf.function now uses the Python inspect library directly for parsing - the signature of the Python function it is decorated on. - * This can break certain cases that were previously ignored where the - signature is malformed, e.g. - * Using functools.wraps on a function with different signature - * Using functools.partial with an invalid tf.function input - * tf.function now enforces input parameter names to be valid Python - identifiers. Incompatible names are automatically sanitized similarly to - existing SavedModel signature behavior. - * Parameterless tf.functions are assumed to have an empty input_signature - instead of an undefined one even if the input_signature is unspecified. - * tf.types.experimental.TraceType now requires an additional - `placeholder_value` method to be defined. - * tf.function now traces with placeholder values generated by TraceType - instead of the value itself. + * `tf.function` now uses the Python inspect library directly for parsing the signature of the Python function it is decorated on. This change may break code where the function signature is malformed, but was ignored previously, such as: + * Using `functools.wraps` on a function with different signature + * Using `functools.partial` with an invalid `tf.function` input + * `tf.function` now enforces input parameter names to be valid Python identifiers. Incompatible names are automatically sanitized similarly to existing SavedModel signature behavior. + * Parameterless `tf.function`s are assumed to have an empty `input_signature` instead of an undefined one even if the `input_signature` is unspecified. + * `tf.types.experimental.TraceType` now requires an additional `placeholder_value` method to be defined. + * `tf.function` now traces with placeholder values generated by TraceType instead of the value itself. -* `tf.config.experimental.enable_mlir_graph_optimization`: +* Experimental APIs `tf.config.experimental.enable_mlir_graph_optimization` and `tf.config.experimental.disable_mlir_graph_optimization` were removed. - * Experimental API removed. +### Major Features and Improvements -* `tf.config.experimental.disable_mlir_graph_optimization`: - - * Experimental API removed. - -* `tf.keras` - - * Moved all saving-related utilities to a new namespace, `keras.saving`, - i.e. `keras.saving.load_model`, `keras.saving.save_model`, - `keras.saving.custom_object_scope`, `keras.saving.get_custom_objects`, - `keras.saving.register_keras_serializable`, - `keras.saving.get_registered_name` and - `keras.saving.get_registered_object`. - The previous API locations (in `keras.utils` and `keras.models`) will - stay available indefinitely, but we recommend that you update your code - to point to the new API locations. - * Improvements and fixes in Keras loss masking: - * Whether you represent a ragged tensor as a `tf.RaggedTensor` or using - [keras masking](https://www.tensorflow.org/guide/keras/masking_and_padding), - the returned loss values should be the identical to each other. - In previous versions Keras may have silently ignored the mask. 
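As a companion to the `tf.function` signature-parsing notes above, a small sketch of a `functools.partial` usage that the inspect-based parsing is expected to handle; the `scale` and `double` names are illustrative and not from the patch.

```python
import functools
import tensorflow as tf

def scale(x, factor):
    return x * factor

# A partial whose remaining signature is a single, well-named tensor argument.
# This is the well-formed case; a partial that re-binds or shadows arguments is
# the kind of input the 2.12 notes say is no longer silently accepted.
double = tf.function(functools.partial(scale, factor=2.0))

print(double(tf.constant([1.0, 2.0])))  # [2. 4.]
```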
- * If you use masked losses with Keras the loss values may be different - in TensorFlow `2.12` compared to previous versions. - * In cases where the mask was previously ignored, you will now get - an error if you pass a mask with an incompatible shape. - -* `tf.SavedModel` - - * Introduce new class `tf.saved_model.experimental.Fingerprint` that - contains the fingerprint of the SavedModel. See the - [SavedModel Fingerprinting RFC](https://github.com/tensorflow/community/pull/415) - for details. - * Introduce API `tf.saved_model.experimental.read_fingerprint(export_dir)` - for reading the fingerprint of a SavedModel. - - -# Known Caveats - -* -* -* - -# Major Features and Improvements +* Support for Python 3.11 has been added. +* Support for Python 3.7 has been removed. We are not releasing any more patches for Python 3.7. * `tf.lite`: * Add 16-bit float type support for built-in op `fill`. * Transpose now supports 6D tensors. - * Float LSTM now supports diagonal recurrent tensors: - https://arxiv.org/abs/1903.08023 - -* `tf.keras`: - - * The new Keras model saving format (`.keras`) is available. You can start - using it via `model.save(f"{fname}.keras", save_format="keras_v3")`. In - the future it will become the default for all files with the `.keras` - extension. This file format targets the Python runtime only and makes - it possible to reload Python objects identical to the saved originals. - The format supports non-numerical state such as vocabulary files and - lookup tables, and it is easy to customize in the case of custom layers - with exotic elements of state (e.g. a FIFOQueue). The format - does not rely on bytecode or pickling, and is safe by default. Note - that as a result, Python `lambdas` are disallowed at loading time. If - you want to use `lambdas`, you can pass `safe_mode=False` to the loading - method (only do this if you trust the source of the model). - * Added a `model.export(filepath)` API to create a lightweight SavedModel - artifact that can be used for inference (e.g. with TF-Serving). - * Added `keras.export.ExportArchive` class for low-level customization of - the process of exporting SavedModel artifacts for inference. - Both ways of exporting models are based on `tf.function` tracing - and produce a TF program composed of TF ops. They are meant primarily - for environments where the TF runtime is available, - but not the Python interpreter, as is typical - for production with TF Serving. - * Added utility `tf.keras.utils.FeatureSpace`, a one-stop shop for - structured data preprocessing and encoding. - * Added `tf.SparseTensor` input support to `tf.keras.layers.Embedding` - layer. The layer now accepts a new boolean argument `sparse`. If - `sparse` is set to True, the layer returns a SparseTensor instead of a - dense Tensor. Defaults to False. - * Added `jit_compile` as a settable property to `tf.keras.Model`. - * Added `synchronized` optional parameter to `layers.BatchNormalization`. - * Added deprecation warning to - `layers.experimental.SyncBatchNormalization` and suggested to use - `layers.BatchNormalization` with `synchronized=True` instead. - * Updated `tf.keras.layers.BatchNormalization` to support masking of the - inputs (`mask` argument) when computing the mean and variance. - * Add `tf.keras.layers.Identity`, a placeholder pass-through layer. - * Add `show_trainable` option to `tf.keras.utils.model_to_dot` to display - layer trainable status in model plots. 
- * Add ability to save a `tf.keras.utils.FeatureSpace` object, via - `feature_space.save("myfeaturespace.keras")`, and reload it via - `feature_space = tf.keras.models.load_model("myfeaturespace.keras")`. - * Added utility `tf.keras.utils.to_ordinal` to convert class vector to - ordinal regression / classification matrix. + * Float LSTM now supports diagonal recurrent tensors: https://arxiv.org/abs/1903.08023 * `tf.experimental.dtensor`: - * Coordination service now works with - `dtensor.initialize_accelerator_system`, and enabled by default. - * Add `tf.experimental.dtensor.is_dtensor` to check if a tensor is a - DTensor instance. + * Coordination service now works with `dtensor.initialize_accelerator_system`, and enabled by default. + * Add `tf.experimental.dtensor.is_dtensor` to check if a tensor is a DTensor instance. * `tf.data`: - * Added support for alternative checkpointing protocol which makes it - possible to checkpoint the state of the input pipeline without having to - store the contents of internal buffers. The new functionality can be - enabled through the `experimental_symbolic_checkpoint` option of - `tf.data.Options()`. - * Added a new `rerandomize_each_iteration` argument for the - `tf.data.Dataset.random()` operation, which controls whether the - sequence of generated random numbers should be re-randomized every epoch - or not (the default behavior). If `seed` is set and - `rerandomize_each_iteration=True`, the `random()` operation will produce - a different (deterministic) sequence of numbers every epoch. - * Added a new `rerandomize_each_iteration` argument for the - `tf.data.Dataset.sample_from_datasets()` operation, which controls - whether the sequence of generated random numbers used for sampling - should be re-randomized every epoch or not. If `seed` is set and - `rerandomize_each_iteration=True`, the `sample_from_datasets()` - operation will use a different (deterministic) sequence of numbers every - epoch. - * Added a new field, `warm_start`, to - `tf.data.experimental.OptimizationOptions`. If it is set to `True`, - tf.data will start background threads of asynchronous - transformations upon iterator creation (as opposed to upon first call - to `GetNext`). To enable this behavior, set `warm_start=True` in - `tf.data.experimental.OptimizationOptions`. It should be noted that this - possibly improves the latency of the initial 'GetNext' call at the - expense of requiring more memory to hold prefetched elements between - the time of iterator construction and usage. + * Added support for alternative checkpointing protocol which makes it possible to checkpoint the state of the input pipeline without having to store the contents of internal buffers. The new functionality can be enabled through the `experimental_symbolic_checkpoint` option of `tf.data.Options()`. + * Added a new `rerandomize_each_iteration` argument for the `tf.data.Dataset.random()` operation, which controls whether the sequence of generated random numbers should be re-randomized every epoch or not (the default behavior). If `seed` is set and `rerandomize_each_iteration=True`, the `random()` operation will produce a different (deterministic) sequence of numbers every epoch. + * Added a new `rerandomize_each_iteration` argument for the `tf.data.Dataset.sample_from_datasets()` operation, which controls whether the sequence of generated random numbers used for sampling should be re-randomized every epoch or not. 
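A short sketch of the new `tf.data` knobs mentioned above, assuming a TF 2.12-level runtime; only `rerandomize_each_iteration` and `experimental_symbolic_checkpoint` are taken from the notes, the rest is illustrative.

```python
import tensorflow as tf

# Deterministic given the seed, but re-randomized on every pass over the data.
ds = tf.data.Dataset.random(seed=42, rerandomize_each_iteration=True).take(3)
for epoch in range(2):
    print([int(x) for x in ds])  # a different sequence each epoch

# Opt in to symbolic checkpointing of input-pipeline state.
options = tf.data.Options()
options.experimental_symbolic_checkpoint = True
ds = ds.with_options(options)
```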
If `seed` is set and `rerandomize_each_iteration=True`, the `sample_from_datasets()` operation will use a different (deterministic) sequence of numbers every epoch. + * `tf.test`: - * Added `tf.test.experimental.sync_devices`, which is useful for - accurately measuring performance in benchmarks. + * Added `tf.test.experimental.sync_devices`, which is useful for accurately measuring performance in benchmarks. * `tf.experimental.dtensor`: * Added experimental support to ReduceScatter fuse on GPU (NCCL). -# Bug Fixes and Other Changes - -* -* -* +### Bug Fixes and Other Changes +* `tf.SavedModel`: + * Introduced new class `tf.saved_model.experimental.Fingerprint` that contains the fingerprint of the SavedModel. See the [SavedModel Fingerprinting RFC](https://github.com/tensorflow/community/pull/415) for details. + * Introduced API `tf.saved_model.experimental.read_fingerprint(export_dir)` for reading the fingerprint of a SavedModel. * `tf.random` - * Added non-experimental aliases for `tf.random.split` and - `tf.random.fold_in`, the experimental endpoints are still available - so no code changes are necessary. + * Added non-experimental aliases for `tf.random.split` and `tf.random.fold_in`, the experimental endpoints are still available so no code changes are necessary. * `tf.experimental.ExtensionType` - * Added function `experimental.extension_type.as_dict()`, which converts an - instance of `tf.experimental.ExtensionType` to a `dict` representation. + * Added function `experimental.extension_type.as_dict()`, which converts an instance of `tf.experimental.ExtensionType` to a `dict` representation. * `stream_executor` - * Top level `stream_executor` directory has been deleted, users should use - equivalent headers and targets under `compiler/xla/stream_executor`. + * Top level `stream_executor` directory has been deleted, users should use equivalent headers and targets under `compiler/xla/stream_executor`. * `tf.nn` - * Added `tf.nn.experimental.general_dropout`, which is similar to - `tf.random.experimental.stateless_dropout` but accepts a custom sampler - function. + * Added `tf.nn.experimental.general_dropout`, which is similar to `tf.random.experimental.stateless_dropout` but accepts a custom sampler function. * `tf.types.experimental.GenericFunction` - * The `experimental_get_compiler_ir` method supports tf.TensorSpec - compilation arguments. + * The `experimental_get_compiler_ir` method supports tf.TensorSpec compilation arguments. * `tf.config.experimental.mlir_bridge_rollout` - * Removed enums `MLIR_BRIDGE_ROLLOUT_SAFE_MODE_ENABLED` and - `MLIR_BRIDGE_ROLLOUT_SAFE_MODE_FALLBACK_ENABLED` which are no longer used by - the tf2xla bridge + * Removed enums `MLIR_BRIDGE_ROLLOUT_SAFE_MODE_ENABLED` and `MLIR_BRIDGE_ROLLOUT_SAFE_MODE_FALLBACK_ENABLED` which are no longer used by the tf2xla bridge + +## Keras + + Keras is a framework built on top of the TensorFlow. See more details on the Keras [website](https://keras.io/). + +### Breaking Changes -# Thanks to our Contributors +`tf.keras`: + +* Moved all saving-related utilities to a new namespace, `keras.saving`, for example: `keras.saving.load_model`, `keras.saving.save_model`, `keras.saving.custom_object_scope`, `keras.saving.get_custom_objects`, `keras.saving.register_keras_serializable`,`keras.saving.get_registered_name` and `keras.saving.get_registered_object`. The previous API locations (in `keras.utils` and `keras.models`) will be available indefinitely, but we recommend you update your code to point to the new API locations. 
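A minimal sketch of the relocated `keras.saving` namespace described in the bullet above; the `Scale` layer and the `demo` package name are illustrative.

```python
import numpy as np
from tensorflow import keras

# Custom-object registration through the relocated keras.saving namespace.
@keras.saving.register_keras_serializable(package="demo")
class Scale(keras.layers.Layer):
    def __init__(self, factor=2.0, **kwargs):
        super().__init__(**kwargs)
        self.factor = factor

    def call(self, inputs):
        return inputs * self.factor

    def get_config(self):
        return {**super().get_config(), "factor": self.factor}

# The registry helpers also live under keras.saving now.
print(keras.saving.get_registered_name(Scale))                    # demo>Scale
print(keras.saving.get_registered_object("demo>Scale") is Scale)  # True

print(Scale(factor=0.5)(np.ones((1, 3))))
```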
+ * Improvements and fixes in Keras loss masking: + * Whether you represent a ragged tensor as a `tf.RaggedTensor` or using [keras masking](https://www.tensorflow.org/guide/keras/masking_and_padding), the returned loss values should be the identical to each other. In previous versions Keras may have silently ignored the mask. + * If you use masked losses with Keras the loss values may be different in TensorFlow `2.12` compared to previous versions. + * In cases where the mask was previously ignored, you will now get an error if you pass a mask with an incompatible shape. + +### Major Features and Improvements + +`tf.keras`: + + * The new Keras model saving format (`.keras`) is available. You can start using it via `model.save(f"{fname}.keras", save_format="keras_v3")`. In the future it will become the default for all files with the `.keras` extension. This file format targets the Python runtime only and makes it possible to reload Python objects identical to the saved originals. The format supports non-numerical state such as vocabulary files and lookup tables, and it is easy to customize in the case of custom layers with exotic elements of state (e.g. a FIFOQueue). The format does not rely on bytecode or pickling, and is safe by default. Note that as a result, Python `lambdas` are disallowed at loading time. If you want to use `lambdas`, you can pass `safe_mode=False` to the loading method (only do this if you trust the source of the model). +* Added a `model.export(filepath)` API to create a lightweight SavedModel artifact that can be used for inference (e.g. with TF-Serving). +* Added `keras.export.ExportArchive` class for low-level customization of the process of exporting SavedModel artifacts for inference. Both ways of exporting models are based on `tf.function` tracing and produce a TF program composed of TF ops. They are meant primarily for environments where the TF runtime is available, but not the Python interpreter, as is typical for production with TF Serving. + * Added utility `tf.keras.utils.FeatureSpace`, a one-stop shop for structured data preprocessing and encoding. + * Added `tf.SparseTensor` input support to `tf.keras.layers.Embedding` layer. The layer now accepts a new boolean argument `sparse`. If `sparse` is set to True, the layer returns a SparseTensor instead of a dense Tensor. Defaults to False. + * Added `jit_compile` as a settable property to `tf.keras.Model`. + * Added `synchronized` optional parameter to `layers.BatchNormalization`. + * Added deprecation warning to `layers.experimental.SyncBatchNormalization` and suggested to use `layers.BatchNormalization` with `synchronized=True` instead. + * Updated `tf.keras.layers.BatchNormalization` to support masking of the inputs (`mask` argument) when computing the mean and variance. + * Add `tf.keras.layers.Identity`, a placeholder pass-through layer. + * Add `show_trainable` option to `tf.keras.utils.model_to_dot` to display layer trainable status in model plots. + * Add ability to save a `tf.keras.utils.FeatureSpace` object, via `feature_space.save("myfeaturespace.keras")`, and reload it via `feature_space = tf.keras.models.load_model("myfeaturespace.keras")`. +* Added utility `tf.keras.utils.to_ordinal` to convert class vector to ordinal regression / classification matrix. + +### Bug Fixes and Other Changes + +* N/A + +## Security + +* Moving forward, TensorFlow will no longer update [TFSAs](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/security). 
Please refer instead to our [GitHub security advisories](https://github.com/tensorflow/tensorflow/security/advisories), which are attached to [CVEs](https://cve.mitre.org/cve/). +* Fixes an FPE in TFLite in conv kernel [CVE-2023-27579](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-27579) +* Fixes a double free in Fractional(Max/Avg)Pool [CVE-2023-25801](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25801) +* Fixes a null dereference on ParallelConcat with XLA [CVE-2023-25676](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25676) +* Fixes a segfault in Bincount with XLA [CVE-2023-25675](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25675) +* Fixes an NPE in RandomShuffle with XLA enable [CVE-2023-25674](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25674) +* Fixes an FPE in TensorListSplit with XLA [CVE-2023-25673](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25673) +* Fixes segmentation fault in tfg-translate [CVE-2023-25671](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25671) +* Fixes an NPE in QuantizedMatMulWithBiasAndDequantize [CVE-2023-25670](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25670) +* Fixes an FPE in AvgPoolGrad with XLA [CVE-2023-25669](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25669) +* Fixes a heap out-of-buffer read vulnerability in the QuantizeAndDequantize operation [CVE-2023-25668](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25668) +* Fixes a segfault when opening multiframe gif [CVE-2023-25667](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25667) +* Fixes an NPE in SparseSparseMaximum [CVE-2023-25665](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25665) +* Fixes an FPE in AudioSpectrogram [CVE-2023-25666](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25666) +* Fixes a heap-buffer-overflow in AvgPoolGrad [CVE-2023-25664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25664) +* Fixes a NPE in TensorArrayConcatV2 [CVE-2023-25663](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25663) +* Fixes a Integer overflow in EditDistance [CVE-2023-25662](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25662) +* Fixes a Seg fault in `tf.raw_ops.Print` [CVE-2023-25660](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25660) +* Fixes a OOB read in DynamicStitch [CVE-2023-25659](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25659) +* Fixes a OOB Read in GRUBlockCellGrad [CVE-2023-25658](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25658) + +## Thanks to our Contributors This release contains contributions from many people at Google, as well as: -, , , , , +103yiran, 8bitmp3, Aakar, Aakar Dwivedi, Abinash Satapathy, Aditya Kane, ag.ramesh, Alexander Grund, Andrei Pikas, andreii, Andrew Goodbody, angerson, Anthony_256, Ashay Rane, Ashiq Imran, Awsaf, Balint Cristian, Banikumar Maiti (Intel Aipg), Ben Barsdell, bhack, cfRod, Chao Chen, chenchongsong, Chris Mc, Daniil Kutz, David Rubinstein, dianjiaogit, dixr, Dongfeng Yu, dongfengy, drah, Eric Kunze, Feiyue Chen, Frederic Bastien, Gauri1 Deshpande, guozhong.zhuang, hDn248, HYChou, ingkarat, James Hilliard, Jason Furmanek, Jaya, Jens Glaser, Jerry Ge, Jiao Dian'S Power Plant, Jie Fu, Jinzhe Zeng, Jukyy, Kaixi Hou, Kanvi Khanna, Karel Ha, karllessard, Koan-Sin Tan, Konstantin Beluchenko, Kulin Seth, Kun Lu, Kyle Gerard Felker, Leopold Cambier, Lianmin Zheng, linlifan, liuyuanqiang, Lukas Geiger, Luke Hutton, Mahmoud Abuzaina, Manas Mohanty, Mateo 
Fidabel, Maxiwell S. Garcia, Mayank Raunak, mdfaijul, meatybobby, Meenakshi Venkataraman, Michael Holman, Nathan John Sircombe, Nathan Luehr, nitins17, Om Thakkar, Patrice Vignola, Pavani Majety, per1234, Philipp Hack, pollfly, Prianka Liz Kariat, Rahul Batra, rahulbatra85, ratnam.parikh, Rickard Hallerbäck, Roger Iyengar, Rohit Santhanam, Roman Baranchuk, Sachin Muradi, sanadani, Saoirse Stewart, seanshpark, Shawn Wang, shuw, Srinivasan Narayanamoorthy, Stewart Miles, Sunita Nadampalli, SuryanarayanaY, Takahashi Shuuji, Tatwai Chong, Thibaut Goetghebuer-Planchon, tilakrayal, Tirumalesh, TJ, Tony Sung, Trevor Morris, unda, Vertexwahn, Vinila S, William Muir, Xavier Bonaventura, xiang.zhang, Xiao-Yong Jin, yleeeee, Yong Tang, Yuriy Chernyshov, Zhang, Xiangze, zhaozheng09 + + +# Release 2.11.1 + +**Note**: TensorFlow 2.10 was the last TensorFlow release that supported GPU on native-Windows. Starting with TensorFlow 2.11, you will need to install TensorFlow in WSL2, or install tensorflow-cpu and, optionally, try the TensorFlow-DirectML-Plugin. +* Security vulnerability fixes will no longer be patched to this Tensorflow version. The latest Tensorflow version includes the security vulnerability fixes. You can update to the latest version (recommended) or patch security vulnerabilities yourself [steps](https://github.com/tensorflow/tensorflow#patching-guidelines). You can refer to the [release notes](https://github.com/tensorflow/tensorflow/releases) of the latest Tensorflow version for a list of newly fixed vulnerabilities. If you have any questions, please create a GitHub issue to let us know. + +This release also introduces several vulnerability fixes: + +* Fixes an FPE in TFLite in conv kernel [CVE-2023-27579](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-27579) +* Fixes a double free in Fractional(Max/Avg)Pool [CVE-2023-25801](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25801) +* Fixes a null dereference on ParallelConcat with XLA [CVE-2023-25676](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25676) +* Fixes a segfault in Bincount with XLA [CVE-2023-25675](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25675) +* Fixes an NPE in RandomShuffle with XLA enable [CVE-2023-25674](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25674) +* Fixes an FPE in TensorListSplit with XLA [CVE-2023-25673](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25673) +* Fixes segmentation fault in tfg-translate [CVE-2023-25671](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25671) +* Fixes an NPE in QuantizedMatMulWithBiasAndDequantize [CVE-2023-25670](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25670) +* Fixes an FPE in AvgPoolGrad with XLA [CVE-2023-25669](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25669) +* Fixes a heap out-of-buffer read vulnerability in the QuantizeAndDequantize operation [CVE-2023-25668](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25668) +* Fixes a segfault when opening multiframe gif [CVE-2023-25667](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25667) +* Fixes an NPE in SparseSparseMaximum [CVE-2023-25665](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25665) +* Fixes an FPE in AudioSpectrogram [CVE-2023-25666](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25666) +* Fixes a heap-buffer-overflow in AvgPoolGrad [CVE-2023-25664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25664) +* Fixes a NPE in TensorArrayConcatV2 
[CVE-2023-25663](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25663) +* Fixes a Integer overflow in EditDistance [CVE-2023-25662](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25662) +* Fixes a Seg fault in `tf.raw_ops.Print` [CVE-2023-25660](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25660) +* Fixes a OOB read in DynamicStitch [CVE-2023-25659](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25659) +* Fixes a OOB Read in GRUBlockCellGrad [CVE-2023-25658](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-25658) + # Release 2.11.0 diff --git a/SECURITY.md b/SECURITY.md index 0964f7debb1..87a16f17538 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -279,9 +279,9 @@ For each vulnerability, we try to ingress it as soon as possible, given the size of the team and the number of reports. Vulnerabilities will, in general, be batched to be fixed at the same time as a quarterly release. -Past security advisories are listed +Security advisories from 2018 to March 2023 are listed [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/README.md). -In the future, we might sunset this list and only use GitHub's Security Advisory -format, to simplify the post-vulnerability-fix process. We credit reporters for -identifying security issues, although we keep your name confidential if you -request it. +From TF 2.13 onwards, we have sunset this list and only use GitHub's Security +Advisory format, to simplify the post-vulnerability-fix process. In both +locations, we credit reporters for identifying security issues, although we keep +your name confidential if you request it. diff --git a/tensorflow/BUILD b/tensorflow/BUILD index ec1887945c4..fce465ff1f2 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -32,6 +32,10 @@ load( "//third_party/mkl:build_defs.bzl", "if_mkl_ml", ) +load( + "//third_party/mkl_dnn:build_defs.bzl", + "if_onednn_v3", +) load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load( "//tensorflow:tensorflow.default.bzl", @@ -124,7 +128,7 @@ PACKAGE_STATIC_DEPS = [ "@flatbuffers//:__subpackages__", "@nccl_archive//:__subpackages__", "@triton//:__subpackages__", -] + tsl_async_value_deps() +] + tsl_async_value_deps() + if_onednn_v3(["@onednn_v3//:__subpackages__"]) package( # copybara:uncomment default_applicable_licenses = [":license"], @@ -1025,8 +1029,10 @@ package_group( "//third_party/cloud_tpu/inference_converter/...", "//third_party/py/cloud_ml_autoflow/...", "//third_party/py/envlogger/...", + "//third_party/py/gldm/...", "//third_party/py/keras/...", "//third_party/yggdrasil_decision_forests/...", + "//waymo/ml/cn/...", ], ) @@ -1144,6 +1150,9 @@ tf_cc_shared_library( ], "//conditions:default": [ "-Wl,--version-script,$(location //tensorflow:tf_framework_version_script.lds)", + # copybara:uncomment_begin(google-only) + # "-Wl,--undefined-version", + # copybara:uncomment_end(google-only) ], }), linkstatic = 1, @@ -1350,6 +1359,7 @@ tf_cc_shared_library( "//tensorflow/core/data/service:server_lib", "//tensorflow/core/debug", "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/framework:full_type_util", "//tensorflow/core/function/runtime_client:runtime_client_cc", "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core/grappler/clusters:single_machine", diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index d217e7a1f51..0e70244453f 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -22,6 +22,29 @@ package( licenses = ["notice"], ) +filegroup( + name = "safe_ptr_hdr", + 
srcs = ["safe_ptr.h"], + visibility = [ + "//tensorflow:internal", + ], +) + +cc_library( + name = "safe_ptr", + srcs = [ + "safe_ptr.cc", + "//tensorflow/c/eager:headers", + ], + hdrs = ["safe_ptr.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":c_api_internal", + ], +) + # ----------------------------------------------------------------------------- # Public targets @@ -62,10 +85,10 @@ filegroup( "*test*", ], ) + [ - "//tensorflow/tsl/c:srcs", - "//tensorflow/tsl/platform:ctstring", "//tensorflow/cc:srcs_no_runtime", "//tensorflow/core/distributed_runtime:server_lib.h", + "//tensorflow/tsl/c:srcs", + "//tensorflow/tsl/platform:ctstring", ], visibility = ["//visibility:public"], ) @@ -94,14 +117,17 @@ cc_library( name = "c_api_headers", hdrs = [ "c_api.h", - "c_api_macros.h", ], visibility = ["//visibility:public"], deps = [ + ":c_api_macros_hdrs", ":tf_attrtype", - ":tf_buffer", - ":tf_datatype", + ":tf_buffer_hdrs", + ":tf_datatype_hdrs", ":tf_status_headers", + ":tf_tensor_hdrs", + # TODO: Only include tf_tstring_hdrs. Don't expose the implementation of TF_TString to API + # users. ":tf_tstring", ], ) @@ -165,6 +191,14 @@ cc_library( visibility = ["//visibility:public"], ) +cc_library( + name = "c_api_macros_hdrs", + hdrs = [ + "c_api_macros.h", + ], + visibility = ["//visibility:public"], +) + cc_library( name = "c_api_macros", hdrs = [ @@ -195,8 +229,9 @@ tf_cuda_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ - ":c_api_no_xla", ":c_api_internal", + ":c_api_macros_hdrs", + ":c_api_no_xla", ":tf_attrtype", ":tf_buffer", ":tf_file_statistics", @@ -207,8 +242,8 @@ tf_cuda_library( "//tensorflow/tsl/c:tsl_status", ] + select({ "//tensorflow:with_xla_support": [ - "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/jit", + "//tensorflow/compiler/tf2xla:xla_compiler", ], "//conditions:default": [], }) + if_tensorrt([ @@ -240,9 +275,9 @@ tf_cuda_library( deps = [ ":c_api_internal", ":tf_attrtype", - ":tf_datatype", ":tf_buffer", ":tf_buffer_internal", + ":tf_datatype", ":tf_status_internal", ] + select({ "//tensorflow:android": [ @@ -253,25 +288,25 @@ tf_cuda_library( ":logging", ":tf_status", ":tf_tensor", - "@com_google_absl//absl/strings", "//tensorflow/c/experimental/filesystem:modular_filesystem", - "//tensorflow/cc/saved_model:loader_lite", + "//tensorflow/cc:grad_ops", "//tensorflow/cc:gradients", "//tensorflow/cc:ops", - "//tensorflow/cc:grad_ops", "//tensorflow/cc:scope_internal", "//tensorflow/cc:while_loop", + "//tensorflow/cc/saved_model:loader_lite", + "//tensorflow/compiler/mlir/tfr:graph_decompose_pass", + "//tensorflow/compiler/mlir/tfr:node_expansion_pass", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", - "//tensorflow/core:op_gen_lib", - "//tensorflow/core:protos_all_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:op_gen_lib", + "//tensorflow/core:protos_all_cc", "//tensorflow/core/distributed_runtime:server_lib", "//tensorflow/core/kernels:logging_ops", - "//tensorflow/compiler/mlir/tfr:node_expansion_pass", - "//tensorflow/compiler/mlir/tfr:graph_decompose_pass", + "@com_google_absl//absl/strings", ], }), alwayslink = 1, @@ -308,9 +343,10 @@ tf_cuda_library( "//tensorflow/core/transforms:__subpackages__", ], deps = [ - "//tensorflow/tsl/platform:status", + ":c_api_macros_hdrs", "//tensorflow/tsl/c:tsl_status", "//tensorflow/tsl/c:tsl_status_internal", + "//tensorflow/tsl/platform:status", ] + select({ 
"//tensorflow:android": [ "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs @@ -363,6 +399,7 @@ cc_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":c_api_macros_hdrs", ":tf_status_internal", "//tensorflow/tsl/c:tsl_status", ] + select({ @@ -380,7 +417,8 @@ cc_library( hdrs = ["tf_status.h"], visibility = ["//visibility:public"], deps = [ - "//tensorflow/tsl/c:tsl_status", + ":c_api_macros_hdrs", + "//tensorflow/tsl/c:tsl_status_headers", ], ) @@ -390,15 +428,15 @@ cc_library( "tf_tstring.cc", ], hdrs = [ - "c_api_macros.h", - "tf_datatype.h", - "tf_status.h", - "tf_tensor.h", "tf_tstring.h", ], copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":c_api_macros_hdrs", + ":tf_datatype_hdrs", + ":tf_status_headers", + ":tf_tensor_hdrs", "//tensorflow/core/platform:status", "//tensorflow/core/platform:tstring", "//tensorflow/tsl/c:tsl_status", @@ -426,13 +464,23 @@ cc_library( }), ) +cc_library( + name = "tf_datatype_hdrs", + hdrs = ["tf_datatype.h"], + deps = [ + ":c_api_macros_hdrs", + ], +) + cc_library( name = "tf_datatype", srcs = ["tf_datatype.cc"], hdrs = ["tf_datatype.h"], copts = tf_copts(), visibility = ["//visibility:public"], - deps = select({ + deps = [ + ":c_api_macros_hdrs", + ] + select({ "//tensorflow:android": [ "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], @@ -443,6 +491,17 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "tf_tensor_hdrs", + hdrs = ["tf_tensor.h"], + visibility = ["//visibility:public"], + deps = [ + ":c_api_macros_hdrs", + ":tf_datatype_hdrs", + ":tf_status_headers", + ], +) + cc_library( name = "tf_tensor", srcs = ["tf_tensor.cc"], @@ -493,6 +552,16 @@ tf_cuda_library( }), ) +cc_library( + name = "tf_buffer_hdrs", + hdrs = [ + "tf_buffer.h", + ], + deps = [ + ":c_api_macros_hdrs", + ], +) + cc_library( name = "tf_buffer", srcs = [ @@ -504,6 +573,7 @@ cc_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":c_api_macros_hdrs", ":tf_buffer_internal", ":tf_status", ":tf_tensor_internal", @@ -525,6 +595,7 @@ tf_cuda_library( "//tensorflow/c:__subpackages__", ], deps = [ + ":c_api_macros_hdrs", ":tf_status", ":tf_tensor_internal", "//tensorflow/core/platform:protobuf", @@ -545,6 +616,7 @@ tf_cuda_library( deps = [ ":c_api", ":c_api_internal", + ":c_api_macros_hdrs", ":checkpoint_reader", ":tf_buffer", ":tf_buffer_internal", @@ -635,9 +707,9 @@ tf_cuda_library( ], }) + [ ":c_api_macros", + ":tf_file_statistics", ":tf_status", ":tf_status_helper", - ":tf_file_statistics", "//tensorflow/core/platform:env", "//tensorflow/core/platform:path", "//tensorflow/core/platform:types", @@ -652,10 +724,11 @@ cc_library( ], visibility = ["//tensorflow:internal"], deps = [ - ":c_api_internal", - ":tf_datatype", - ":tf_status", - ":tf_tensor", + ":c_api_headers", + ":c_api_macros_hdrs", + ":tf_datatype_hdrs", + ":tf_status_headers", + ":tf_tensor_hdrs", "//tensorflow/c/experimental/stream_executor:stream_executor_hdrs", ], ) @@ -671,6 +744,7 @@ tf_cuda_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":c_api_macros_hdrs", ":tf_buffer", ":tf_buffer_internal", ":tf_status", @@ -685,12 +759,14 @@ tf_cuda_library( "//conditions:default": [ ":c_api_internal", ":tf_tensor", - "//tensorflow/compiler/xla/stream_executor:stream_executor", + "//tensorflow/c/experimental/stream_executor", + "//tensorflow/c/experimental/stream_executor:stream_executor_internal", + 
"//tensorflow/compiler/xla/stream_executor", "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//tensorflow/core:protos_all_cc", - "//tensorflow/c/experimental/stream_executor:stream_executor", - "//tensorflow/c/experimental/stream_executor:stream_executor_internal", + "//tensorflow/tsl/framework:device_id_utils", + "//tensorflow/tsl/platform:statusor", ], }), ) @@ -699,7 +775,10 @@ cc_library( name = "kernels_experimental_hdrs", hdrs = ["kernels_experimental.h"], visibility = ["//tensorflow:internal"], - deps = [":kernels_hdrs"], + deps = [ + ":c_api_macros_hdrs", + ":kernels_hdrs", + ], ) tf_cuda_library( @@ -709,6 +788,7 @@ tf_cuda_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":c_api_macros_hdrs", ":kernels", ":tf_status_helper", ":tf_status_internal", @@ -739,6 +819,7 @@ tf_cuda_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":c_api_macros_hdrs", ":tf_datatype", ":tf_status", ":tf_status_helper", @@ -758,6 +839,7 @@ cc_library( hdrs = ["ops.h"], visibility = ["//tensorflow:internal"], deps = [ + ":c_api_macros_hdrs", ":tf_datatype", ":tf_status", ], diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index fb951559a0e..e4c6499506e 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/tf_attrtype.h" #include "tensorflow/c/tf_buffer.h" #include "tensorflow/c/tf_datatype.h" @@ -72,25 +73,6 @@ limitations under the License. // and the API just provides high level controls over the number of // devices of each type. -// Macro to control visibility of exported symbols in the shared library (.so, -// .dylib, .dll). -// This duplicates the TF_EXPORT macro definition in -// tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes. -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - #ifdef __cplusplus extern "C" { #endif diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 3a05e1e64db..45697e20d1e 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -190,7 +190,7 @@ const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) { } char* TF_FunctionDebugString(TF_Function* func, size_t* len) { - const auto& debug_str = DebugString(func->fdef); + const auto& debug_str = DebugString(func->record->fdef()); *len = debug_str.size(); char* ret = static_cast(malloc(*len + 1)); memcpy(ret, debug_str.c_str(), *len + 1); diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index aec1e875eaf..abae68cfe48 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -20,6 +20,7 @@ limitations under the License. #include #include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/eager/c_api.h" // -------------------------------------------------------------------------- @@ -28,25 +29,6 @@ limitations under the License. // The API here is subject to changes in the future. 
// -------------------------------------------------------------------------- -// Macro to control visibility of exported symbols in the shared library (.so, -// .dylib, .dll). -// This duplicates the TF_EXPORT macro definition in -// tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes.$a -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - #ifdef __cplusplus extern "C" { #endif diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc index a13a1458553..2fd92bd7dc0 100644 --- a/tensorflow/c/c_api_function.cc +++ b/tensorflow/c/c_api_function.cc @@ -16,11 +16,13 @@ limitations under the License. #include #include #include +#include #include "absl/strings/match.h" #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/tf_buffer_internal.h" #include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def.pb.h" @@ -30,6 +32,7 @@ limitations under the License. #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/platform/base64.h" #include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/util/debug_data_dumper.h" using tensorflow::errors::InvalidArgument; @@ -203,23 +206,31 @@ TF_Function* TF_GraphToFunctionWithControlOutputs( } // Do the actual function creation. - TF_Function* tf_function = new TF_Function(); DCHECK(append_hash_to_fn_name <= 1); + tensorflow::FunctionDef fdef; status->status = tensorflow::GraphToFunctionDef( fn_body->graph, fn_name, append_hash_to_fn_name != 0, /*set_stateful_from_nodes=*/true, /*copy_placeholder_attrs_from_nodes=*/true, body_nodes, input_tensors, output_tensors, output_names_vec, control_output_nodes, - control_output_names_vec, description, &tf_function->fdef); + control_output_names_vec, description, &fdef); if (TF_GetCode(status) != TF_OK) { - TF_DeleteFunction(tf_function); return nullptr; } + // Dump the op creation stacktraces for debugging purpose. + DEBUG_DATA_DUMPER()->DumpOpCreationStackTraces( + fn_name, kDebugGroupOpStacktrace, "initial", &fn_body->graph); + + tensorflow::StackTracesMap stack_traces; for (const Node* n : fn_body->graph.nodes()) { - tf_function->stack_traces[n->name()] = n->GetStackTrace(); + stack_traces[n->name()] = n->GetStackTrace(); } + TF_Function* tf_function = new TF_Function(); + tf_function->record = new tensorflow::FunctionRecord( + std::move(fdef), std::move(stack_traces), false); + return tf_function; } @@ -238,7 +249,7 @@ TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name, } const char* TF_FunctionName(TF_Function* func) { - return func->fdef.signature().name().c_str(); + return func->record->fdef().signature().name().c_str(); } void TF_GraphCopyFunction(TF_Graph* g, const TF_Function* func, @@ -249,19 +260,20 @@ void TF_GraphCopyFunction(TF_Graph* g, const TF_Function* func, return; } - // TODO(iga): Add AddFunctionDef() and AddGradientDef() methods to graph - // to avoid the extra copy here. 
- tensorflow::FunctionDefLibrary fdef_lib; - *fdef_lib.add_function() = func->fdef; - if (grad) { - *fdef_lib.add_function() = grad->fdef; - tensorflow::GradientDef* gdef = fdef_lib.add_gradient(); - gdef->set_function_name(func->fdef.signature().name()); - gdef->set_gradient_func(grad->fdef.signature().name()); - } - tensorflow::mutex_lock l(g->mu); - status->status = g->graph.AddFunctionLibrary(fdef_lib); + status->status = g->graph.AddFunctionDef(func->record->fdef(), + func->record->stack_traces()); + if (TF_GetCode(status) != TF_OK) return; + if (!grad) return; + + status->status = g->graph.AddFunctionDef(grad->record->fdef(), + grad->record->stack_traces()); + if (TF_GetCode(status) != TF_OK) return; + + tensorflow::GradientDef gdef; + gdef.set_function_name(func->record->fdef().signature().name()); + gdef.set_gradient_func(grad->record->fdef().signature().name()); + status->status = g->graph.AddGradientDef(std::move(gdef)); } int TF_GraphNumFunctions(TF_Graph* g) { @@ -279,7 +291,7 @@ int TF_GraphGetFunctions(TF_Graph* g, TF_Function** funcs, int max_func, const auto len = std::min(max_func, static_cast(lib.function_size())); for (int i = 0; i < len; ++i) { TF_Function* func = new TF_Function(); - func->fdef = lib.function(i); + func->record = new tensorflow::FunctionRecord(lib.function(i), {}, false); funcs[i] = func; } status->status = ::tensorflow::OkStatus(); @@ -288,18 +300,21 @@ int TF_GraphGetFunctions(TF_Graph* g, TF_Function** funcs, int max_func, void TF_FunctionToFunctionDef(TF_Function* func, TF_Buffer* output_func_def, TF_Status* status) { - status->status = MessageToBuffer(func->fdef, output_func_def); + status->status = MessageToBuffer(func->record->fdef(), output_func_def); } TF_Function* TF_FunctionImportFunctionDef(const void* proto, size_t proto_len, TF_Status* status) { - TF_Function* func = new TF_Function(); - if (!func->fdef.ParseFromArray(proto, proto_len)) { + tensorflow::FunctionDef fdef; + bool success = fdef.ParseFromArray(proto, proto_len); + if (!success) { status->status = InvalidArgument( "Invalid FunctionDef given to TF_FunctionImportFunctionDef"); - TF_DeleteFunction(func); return nullptr; } + + TF_Function* func = new TF_Function(); + func->record = new tensorflow::FunctionRecord(std::move(fdef), {}, false); status->status = ::tensorflow::OkStatus(); return func; } @@ -314,21 +329,37 @@ void TF_FunctionSetAttrValueProto(TF_Function* func, const char* attr_name, "TF_FunctionSetAttrValueProto"); return; } - (*func->fdef.mutable_attr())[string(attr_name)] = attr_value; + + auto fdef_or = func->record->mutable_fdef(); + if (!fdef_or.ok()) { + status->status = fdef_or.status(); + return; + } + + (*(fdef_or.value()->mutable_attr()))[string(attr_name)] = attr_value; + status->status = ::tensorflow::OkStatus(); } void TF_FunctionGetAttrValueProto(TF_Function* func, const char* attr_name, TF_Buffer* output_attr_value, TF_Status* status) { - const auto& it = func->fdef.attr().find(attr_name); - if (it == func->fdef.attr().end()) { + const auto& it = func->record->fdef().attr().find(attr_name); + if (it == func->record->fdef().attr().end()) { status->status = - InvalidArgument("Function '", func->fdef.signature().name(), + InvalidArgument("Function '", func->record->fdef().signature().name(), "' has no attr named '", attr_name, "'."); return; } status->status = MessageToBuffer(it->second, output_attr_value); } -void TF_DeleteFunction(TF_Function* func) { delete func; } +void TF_DeleteFunction(TF_Function* func) { + if (func == nullptr) { + return; + } + + 
func->record->Unref(); + func->record = nullptr; + delete func; +} diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc index ec8cfe4a31a..0f177ed30ae 100644 --- a/tensorflow/c/c_api_function_test.cc +++ b/tensorflow/c/c_api_function_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/c_test_util.h" #include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/lib/hash/hash.h" @@ -1210,6 +1211,25 @@ TEST_F(CApiFunctionTest, OutputOpNotInBody) { string(TF_Message(s_))); } +class TestStackTrace : public AbstractStackTrace { + absl::Span<StackFrame const> ToFrames() const override { return frames_; } + + StackFrame LastUserFrame() const override { return frames_.back(); } + + std::vector<StackFrame> GetUserFrames(int limit) const override { + return frames_; + } + + string ToString(const TracePrintingOptions& opts) const override { + auto frame = LastUserFrame(); + return absl::StrCat(frame.file_name, ":", frame.line_number, ":", + frame.function_name); + } + + std::vector<StackFrame> frames_{ + StackFrame({"dummy_file_name", 10, "dummy_function_name"})}; +}; + void DefineFunction(const char* name, TF_Function** func, const char* description = nullptr, bool append_hash = false) { @@ -1221,6 +1241,9 @@ void DefineFunction(const char* name, TF_Function** func, TF_Operation* feed = Placeholder(func_graph.get(), s.get()); TF_Operation* neg = Neg(feed, func_graph.get(), s.get()); + feed->node.SetStackTrace(std::make_shared<TestStackTrace>()); + neg->node.SetStackTrace(std::make_shared<TestStackTrace>()); + TF_Output inputs[] = {{feed, 0}}; TF_Output outputs[] = {{neg, 0}}; *func = TF_GraphToFunction(func_graph.get(), name, append_hash, -1, @@ -1270,11 +1293,11 @@ TEST_F(CApiFunctionTest, GraphToFunctionDefWithPlaceholderAttr) { ASSERT_NE(func_, nullptr); // Verify that FunctionDef has 2 attributes, "v1" and "v2". - ASSERT_EQ(func_->fdef.signature().attr().size(), 2); - EXPECT_EQ(func_->fdef.signature().attr(0).name(), "v1"); - EXPECT_EQ(func_->fdef.signature().attr(0).type(), "int"); - EXPECT_EQ(func_->fdef.signature().attr(1).name(), "v2"); - EXPECT_EQ(func_->fdef.signature().attr(1).type(), "int"); + ASSERT_EQ(func_->record->fdef().signature().attr().size(), 2); + EXPECT_EQ(func_->record->fdef().signature().attr(0).name(), "v1"); + EXPECT_EQ(func_->record->fdef().signature().attr(0).type(), "int"); + EXPECT_EQ(func_->record->fdef().signature().attr(1).name(), "v2"); + EXPECT_EQ(func_->record->fdef().signature().attr(1).type(), "int"); } void NodeWithAttrHelper(TF_Graph* graph, TF_Status* s, const char* name, @@ -1308,14 +1331,65 @@ TEST_F(CApiFunctionTest, GraphToFunctionDefWithArgAttr) { ASSERT_NE(func_, nullptr); // Verify that FunctionDef ArgDef has attributes.
- ASSERT_EQ(func_->fdef.arg_attr_size(), 1); - auto arg_attrs = func_->fdef.arg_attr().find(0); - ASSERT_NE(arg_attrs, func_->fdef.arg_attr().end()); + ASSERT_EQ(func_->record->fdef().arg_attr_size(), 1); + auto arg_attrs = func_->record->fdef().arg_attr().find(0); + ASSERT_NE(arg_attrs, func_->record->fdef().arg_attr().end()); auto iter = arg_attrs->second.attr().find("_test_attr"); ASSERT_NE(iter, arg_attrs->second.attr().end()); EXPECT_EQ(iter->second.s(), "value"); } +TEST_F(CApiFunctionTest, TFGraphToFunctionWithStackTraces) { + DefineFunction(func_name_, &func_); + auto stack_traces = func_->record->stack_traces(); + + EXPECT_EQ(stack_traces.size(), 4); + EXPECT_EQ(stack_traces["neg"]->ToString({}), + "dummy_file_name:10:dummy_function_name"); + EXPECT_EQ(stack_traces["feed"]->ToString({}), + "dummy_file_name:10:dummy_function_name"); +} + +TEST_F(CApiFunctionTest, TFGraphCopyFunctionWithStackTraces) { + // Define the function and its grad + DefineFunction(func_name_, &func_); + TF_Function* grad_func; + DefineFunction("MyGrad", &grad_func); + + // Add func and its gradient to host graph + TF_GraphCopyFunction(host_graph_, func_, grad_func, s_); + + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_DeleteFunction(grad_func); + + const StackTracesMap* func_stack_traces; + const StackTracesMap* grad_stack_traces; + + { + mutex_lock l(host_graph_->mu); + auto flib_def = host_graph_->graph.flib_def(); + func_stack_traces = flib_def.GetStackTraces(func_name_); + grad_stack_traces = flib_def.GetStackTraces("MyGrad"); + } + + // Verify that stack traces of func is copied to graph function library. + ASSERT_NE(func_stack_traces, nullptr); + EXPECT_EQ(func_stack_traces->size(), 4); + EXPECT_EQ(func_stack_traces->at("neg")->ToString({}), + "dummy_file_name:10:dummy_function_name"); + EXPECT_EQ(func_stack_traces->at("feed")->ToString({}), + "dummy_file_name:10:dummy_function_name"); + + // Verify that stack traces of grad_func is copied to graph function library. + ASSERT_NE(grad_stack_traces, nullptr); + EXPECT_EQ(grad_stack_traces->size(), 4); + EXPECT_EQ(grad_stack_traces->at("neg")->ToString({}), + "dummy_file_name:10:dummy_function_name"); + EXPECT_EQ(grad_stack_traces->at("feed")->ToString({}), + "dummy_file_name:10:dummy_function_name"); +} + TEST_F(CApiFunctionTest, SetGradientAndRun) { // Define the function and its grad DefineFunction(func_name_, &func_); diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index a34e11a3e4c..92f63553ee1 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -16,14 +16,14 @@ limitations under the License. #ifndef TENSORFLOW_C_C_API_INTERNAL_H_ #define TENSORFLOW_C_C_API_INTERNAL_H_ -#include "tensorflow/c/c_api.h" - #include #include #include #include #include +#include "tensorflow/c/c_api.h" + // clang-format off // Required for IS_MOBILE_PLATFORM #include "tensorflow/core/platform/platform.h" @@ -34,11 +34,12 @@ limitations under the License. 
#if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) #include "tensorflow/core/framework/op_gen_lib.h" #endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) +#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/shape_refiner.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/status.h" @@ -159,8 +160,7 @@ struct TF_DeviceList { }; struct TF_Function { - tensorflow::FunctionDef fdef; - tensorflow::StackTracesMap stack_traces; + tensorflow::FunctionRecord* record; }; struct TF_ApiDefMap { diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 051c81fc782..008e2d772a3 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -243,7 +243,7 @@ void TestEncodeDecode(int line, const std::vector& data) { src.flat()(i) = data[i]; } TF_Tensor* dst = TF_TensorFromTensor(src, &status); - ASSERT_TRUE(status.ok()) << status.error_message(); + ASSERT_TRUE(status.ok()) << status.message(); // Convert back to a C++ Tensor and ensure we get expected output. Tensor output; @@ -1435,7 +1435,7 @@ TEST(CAPI, SavedModel) { ASSERT_TRUE(input_op != nullptr); Status status; csession.SetInputs({{input_op, TF_TensorFromTensor(input, &status)}}); - ASSERT_TRUE(status.ok()) << status.error_message(); + ASSERT_TRUE(status.ok()) << status.message(); const tensorflow::string output_op_name( tensorflow::ParseTensorName(output_name).first); diff --git a/tensorflow/c/checkpoint_reader.cc b/tensorflow/c/checkpoint_reader.cc index 4a613d874a2..3ed513f0caa 100644 --- a/tensorflow/c/checkpoint_reader.cc +++ b/tensorflow/c/checkpoint_reader.cc @@ -42,7 +42,7 @@ CheckpointReader::CheckpointReader(const string& filename, TF_Status* status) v2_reader_.reset( new BundleReader(Env::Default(), filename /* prefix to a V2 ckpt */)); if (!v2_reader_->status().ok()) { - Set_TF_Status_from_Status(status, v2_reader_->status()); + tsl::Set_TF_Status_from_Status(status, v2_reader_->status()); return; } auto result = BuildV2VarMaps(); @@ -51,7 +51,7 @@ CheckpointReader::CheckpointReader(const string& filename, TF_Status* status) } else { reader_.reset(new TensorSliceReader(filename)); if (!reader_->status().ok()) { - Set_TF_Status_from_Status(status, reader_->status()); + tsl::Set_TF_Status_from_Status(status, reader_->status()); return; } var_to_shape_map_.reset( @@ -102,7 +102,7 @@ void CheckpointReader::GetTensor( } } if (!status.ok()) { - Set_TF_Status_from_Status(out_status, status); + tsl::Set_TF_Status_from_Status(out_status, status); } } diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 1fb1d367dfd..dd61bd26bc1 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -37,20 +37,17 @@ tf_cuda_library( ], "//conditions:default": [ ":immediate_execution_context", + ":immediate_execution_distributed_manager", ":immediate_execution_operation", ":immediate_execution_tensor_handle", - ":immediate_execution_distributed_manager", - ":tfe_context_internal", ":tfe_cancellation_manager_internal", + ":tfe_context_internal", ":tfe_executor_internal", ":tfe_monitoring_internal", ":tfe_op_attrs_internal", ":tfe_op_internal", ":tfe_tensor_debug_info_internal", ":tfe_tensorhandle_internal", 
- "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/types:span", - "@com_google_absl//absl/types:variant", "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", "//tensorflow/c:tf_buffer", @@ -58,6 +55,12 @@ tf_cuda_library( "//tensorflow/c:tf_status_internal", "//tensorflow/c:tf_tensor_internal", "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", "//tensorflow/core/common_runtime/eager:attr_builder", "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:context_distributed_manager", @@ -65,34 +68,32 @@ tf_cuda_library( "//tensorflow/core/common_runtime/eager:custom_device", "//tensorflow/core/common_runtime/eager:eager_executor", "//tensorflow/core/common_runtime/eager:execute", - "//tensorflow/core/common_runtime/eager:tensor_handle", "//tensorflow/core/common_runtime/eager:placement_utils", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime/eager:tensor_handle", "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/types:span", + "@com_google_absl//absl/types:variant", ], }) + [ - "@com_google_absl//absl/memory", ":abstract_tensor_handle", + "//tensorflow/c:c_api_macros_hdrs", + "//tensorflow/core:gpu_runtime", "//tensorflow/core/common_runtime/eager:eager_operation", - "//tensorflow/core/distributed_runtime/eager:remote_mgr", + "//tensorflow/core/distributed_runtime:remote_device", + "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/distributed_runtime:worker_env", + "//tensorflow/core/distributed_runtime:worker_interface", "//tensorflow/core/distributed_runtime/eager:cluster_function_library_runtime", "//tensorflow/core/distributed_runtime/eager:eager_client", - "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", + "//tensorflow/core/distributed_runtime/eager:remote_mgr", "//tensorflow/core/distributed_runtime/rpc:grpc_channel", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", - "//tensorflow/core/distributed_runtime:remote_device", - "//tensorflow/core/distributed_runtime:server_lib", - "//tensorflow/core/distributed_runtime:worker_env", - "//tensorflow/core/distributed_runtime:worker_interface", - "//tensorflow/core:gpu_runtime", + "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", + "@com_google_absl//absl/memory", "@com_google_absl//absl/strings:str_format", ] + internal_tfrt_deps(), alwayslink = 1, @@ -541,7 +542,9 @@ cc_library( cc_library( name = "tfe_op_attrs_internal", hdrs = ["tfe_op_attrs_internal.h"], - visibility = ["//visibility:private"], + visibility = [ + "//tensorflow:internal", + ], deps = [ ":abstract_op_attrs", "//tensorflow/c:conversion_macros", @@ -836,64 +839,84 @@ tf_cuda_library( "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ + ":abstract_context", + ":abstract_operation", + ":abstract_tensor_handle", ":c_api", ":c_api_internal", ":graph_function", + 
":immediate_execution_context", + ":immediate_execution_tensor_handle", ":tfe_context_internal", ":tfe_op_internal", ":tfe_tensorhandle_internal", - ":abstract_operation", - ":abstract_context", - ":abstract_tensor_handle", - ":immediate_execution_tensor_handle", - ":immediate_execution_context", - "//tensorflow/core/lib/llvm_rtti", "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", + "//tensorflow/c:conversion_macros", "//tensorflow/core:core_cpu", - "//tensorflow/core/common_runtime/eager:attr_builder", - "//tensorflow/core/common_runtime/eager:context", - "//tensorflow/core/common_runtime/eager:eager_executor", - "//tensorflow/core/common_runtime/eager:eager_operation", - "//tensorflow/core/common_runtime/eager:execute", - "//tensorflow/core/common_runtime/eager:kernel_and_device", - "//tensorflow/core/common_runtime/eager:tensor_handle", - "//tensorflow/core/common_runtime/eager:copy_to_device_node", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime/eager:attr_builder", + "//tensorflow/core/common_runtime/eager:context", + "//tensorflow/core/common_runtime/eager:copy_to_device_node", + "//tensorflow/core/common_runtime/eager:eager_executor", + "//tensorflow/core/common_runtime/eager:eager_operation", + "//tensorflow/core/common_runtime/eager:execute", + "//tensorflow/core/common_runtime/eager:kernel_and_device", + "//tensorflow/core/common_runtime/eager:tensor_handle", + "//tensorflow/core/lib/llvm_rtti", "@com_google_absl//absl/types:variant", - "//tensorflow/c:conversion_macros", ], }) + select({ "//tensorflow:with_xla_support": [ - "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/jit", "//tensorflow/compiler/jit:xla_device", + "//tensorflow/compiler/tf2xla:xla_compiler", ], "//conditions:default": [], }) + [ - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/time", - "@com_google_absl//absl/container:flat_hash_map", "//tensorflow/c:tf_status_helper", + "//tensorflow/core:gpu_runtime", + "//tensorflow/core/distributed_runtime:remote_device", + "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/distributed_runtime:worker_env", "//tensorflow/core/distributed_runtime/coordination:coordination_service_error_util", "//tensorflow/core/distributed_runtime/eager:eager_client", - "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", "//tensorflow/core/distributed_runtime/rpc:grpc_channel", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", - "//tensorflow/core/distributed_runtime:remote_device", - "//tensorflow/core/distributed_runtime:server_lib", - "//tensorflow/core/distributed_runtime:worker_env", - "//tensorflow/core:gpu_runtime", + "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", "//tensorflow/tsl/distributed_runtime/coordination:coordination_service_agent", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", + ], + alwayslink = 1, +) + +cc_library( + name = "c_api_experimental_reader", + testonly = True, + srcs = [ + "c_api_experimental_reader.cc", + ], + hdrs = [ + 
"c_api_experimental_reader.h", + "tfe_monitoring_reader_internal.h", + ], + visibility = ["//tensorflow:internal"], + deps = [ + ":c_api", + "//tensorflow/c:c_api", + "//tensorflow/core/lib/monitoring:cell_reader", + "@com_google_absl//absl/memory", ], alwayslink = 1, ) @@ -920,6 +943,29 @@ tf_cuda_cc_test( ], ) +tf_cuda_cc_test( + name = "c_api_experimental_reader_test", + size = "small", + srcs = [ + "c_api_experimental_reader_test.cc", + ], + args = ["--heap_check="], + tags = tf_cuda_tests_tags() + ["nomac"], + deps = [ + ":c_api", + ":c_api_experimental", + ":c_api_experimental_reader", + ":c_api_test_util", + "//tensorflow/c:c_test_util", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/strings", + ], +) + tf_cuda_cc_test( name = "c_api_unified_experimental_test", size = "small", @@ -1009,6 +1055,23 @@ filegroup( visibility = ["//tensorflow:__subpackages__"], ) +filegroup( + name = "pywrap_headers_monitoring_reader", + srcs = [ + "c_api_experimental_reader.h", + "tfe_monitoring_reader_internal.h", + ], + visibility = ["//tensorflow:__subpackages__"], +) + +filegroup( + name = "headers_monitoring_reader", + srcs = [ + "c_api_experimental_reader.h", + ], + visibility = ["//tensorflow:__subpackages__"], +) + cc_library( name = "dlpack", srcs = ["dlpack.cc"], @@ -1046,6 +1109,9 @@ filegroup( ], exclude = [ "c_api_experimental.cc", + "c_api_experimental_reader.cc", + "c_api_experimental_reader.h", + "tfe_monitoring_reader_internal.h", "c_api_unified_experimental.cc", "c_api_unified_experimental_eager.cc", "c_api_unified_experimental_graph.cc", diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index a4e11b63576..8503485f63c 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include #include +#include #include #include "absl/algorithm/container.h" @@ -137,14 +138,14 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { std::unique_ptr device_mgr( new tensorflow::DynamicDeviceMgr(std::move(devices))); - tensorflow::Rendezvous* r = - new tensorflow::IntraProcessRendezvous(device_mgr.get()); + auto r = tsl::core::RefCountPtr( + new tensorflow::IntraProcessRendezvous(device_mgr.get())); tensorflow::EagerContext* eager_context = new tensorflow::EagerContext( opts->session_options.options, static_cast( opts->device_placement_policy), opts->async, device_mgr.release(), - /*device_mgr_owned*/ true, r, + /*device_mgr_owned*/ true, std::move(r), /*cluster_flr=*/nullptr, /*collective_executor_mgr=*/nullptr, /*run_eager_op_as_function=*/opts->run_eager_op_as_function, @@ -931,9 +932,32 @@ void TFE_ContextAddFunctionDef(TFE_Context* ctx, void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function, TF_Status* status) { - AnnotateEagerRuntimeConstructionContext(function->fdef); + auto fdef_or = function->record->mutable_fdef(); + if (!fdef_or.ok()) { + status->status = fdef_or.status(); + return; + } + + AnnotateEagerRuntimeConstructionContext(*fdef_or.value()); status->status = tensorflow::unwrap(ctx)->AddFunctionDefWithStackTraces( - function->fdef, function->stack_traces); + *fdef_or.value(), function->record->stack_traces()); +} + +TF_Function* TFE_ContextGetFunction(TFE_Context* ctx, const char* name, + TF_Status* status) { + tensorflow::core::RefCountPtr record = + tensorflow::unwrap(ctx)->FindRecord(name); + + if (record == nullptr) { + status->status = tensorflow::errors::NotFound( + "Unable to find Function with name: ", name); + return nullptr; + } + + TF_Function* result = new TF_Function(); + record->Ref(); + result->record = record.get(); + return result; } void TFE_ContextRemoveFunction(TFE_Context* ctx, const char* name, diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index 7ad77587d6f..7f458ac50ab 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -21,25 +21,7 @@ limitations under the License. // stable and can change without notice. #include "tensorflow/c/c_api.h" - -// Macro to control visibility of exported symbols in the shared library (.so, -// .dylib, .dll). 
-// This duplicates the TF_EXPORT macro definition in -// tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes.$a -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG +#include "tensorflow/c/c_api_macros.h" #ifdef __cplusplus extern "C" { diff --git a/tensorflow/c/eager/c_api_cluster_test.cc b/tensorflow/c/eager/c_api_cluster_test.cc index 7a604950a63..c4b58c3dd73 100644 --- a/tensorflow/c/eager/c_api_cluster_test.cc +++ b/tensorflow/c/eager/c_api_cluster_test.cc @@ -150,7 +150,7 @@ void TestRemoteExecuteChangeServerDef(bool async) { updated_server_def.set_task_index(1); tensorflow::Status s = tensorflow::GrpcServer::Create( updated_server_def, tensorflow::Env::Default(), &worker_server); - ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(s.ok()) << s.message(); ASSERT_TRUE(worker_server->Start().ok()); TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); diff --git a/tensorflow/c/eager/c_api_distributed_test.cc b/tensorflow/c/eager/c_api_distributed_test.cc index efd9e8a0a35..e35bc962525 100644 --- a/tensorflow/c/eager/c_api_distributed_test.cc +++ b/tensorflow/c/eager/c_api_distributed_test.cc @@ -434,6 +434,7 @@ class FunctionErrorInjectionPass : public tensorflow::FunctionOptimizationPass { tensorflow::Status Run(const std::string& function_name, const tensorflow::DeviceSet& device_set, const tensorflow::ConfigProto& config_proto, + absl::string_view xla_compile_device_type, std::unique_ptr* graph, tensorflow::FunctionLibraryDefinition* flib_def, std::vector* control_ret_node_names, diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 2490fc440ed..6fbcb7bb56a 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -15,8 +15,10 @@ limitations under the License. #include "tensorflow/c/eager/c_api_experimental.h" +#include #include +#include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" #include "absl/time/time.h" #include "tensorflow/c/c_api.h" @@ -29,6 +31,8 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/distributed_runtime/coordination/coordination_service_error_util.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/lib/monitoring/gauge.h" #include "tensorflow/core/lib/monitoring/sampler.h" @@ -80,7 +84,7 @@ TFE_MonitoringCounter0* TFE_MonitoringNewCounter0(const char* name, TF_Status* status, const char* description) { auto* result = new TFE_MonitoringCounter0({name, description}); - Set_TF_Status_from_Status(status, result->counter->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->counter->GetStatus()); if (!result->counter->GetStatus().ok()) { delete result; return nullptr; @@ -103,7 +107,7 @@ TFE_MonitoringCounter1* TFE_MonitoringNewCounter1(const char* name, const char* description, const char* label1) { auto* result = new TFE_MonitoringCounter1({name, description, label1}); - Set_TF_Status_from_Status(status, result->counter->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->counter->GetStatus()); if (!result->counter->GetStatus().ok()) { delete result; return nullptr; @@ -128,7 +132,7 @@ TFE_MonitoringCounter2* TFE_MonitoringNewCounter2(const char* name, const char* label2) { auto* result = new TFE_MonitoringCounter2({name, description, label1, label2}); - Set_TF_Status_from_Status(status, result->counter->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->counter->GetStatus()); if (!result->counter->GetStatus().ok()) { delete result; return nullptr; @@ -159,7 +163,7 @@ TFE_MonitoringIntGauge0* TFE_MonitoringNewIntGauge0(const char* name, TF_Status* status, const char* description) { auto* result = new TFE_MonitoringIntGauge0({name, description}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -182,7 +186,7 @@ TFE_MonitoringIntGauge1* TFE_MonitoringNewIntGauge1(const char* name, const char* description, const char* label1) { auto* result = new TFE_MonitoringIntGauge1({name, description, label1}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -207,7 +211,7 @@ TFE_MonitoringIntGauge2* TFE_MonitoringNewIntGauge2(const char* name, const char* label2) { auto* result = new TFE_MonitoringIntGauge2({name, description, label1, label2}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -245,7 +249,7 @@ const void TFE_MonitoringStringGaugeCellValue( TFE_MonitoringStringGauge0* TFE_MonitoringNewStringGauge0( const char* name, TF_Status* status, const char* description) { auto* result = new TFE_MonitoringStringGauge0({name, description}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -267,7 +271,7 @@ TFE_MonitoringStringGauge1* TFE_MonitoringNewStringGauge1( const char* name, TF_Status* status, const char* description, const char* label1) { 
auto* result = new TFE_MonitoringStringGauge1({name, description, label1}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -290,7 +294,7 @@ TFE_MonitoringStringGauge2* TFE_MonitoringNewStringGauge2( const char* label1, const char* label2) { auto* result = new TFE_MonitoringStringGauge2({name, description, label1, label2}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -313,7 +317,7 @@ TFE_MonitoringStringGauge3* TFE_MonitoringNewStringGauge3( const char* label1, const char* label2, const char* label3) { auto* result = new TFE_MonitoringStringGauge3( {name, description, label1, label2, label3}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -338,7 +342,7 @@ TFE_MonitoringStringGauge4* TFE_MonitoringNewStringGauge4( const char* label4) { auto* result = new TFE_MonitoringStringGauge4( {name, description, label1, label2, label3, label4}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -370,7 +374,7 @@ TFE_MonitoringBoolGauge0* TFE_MonitoringNewBoolGauge0(const char* name, TF_Status* status, const char* description) { auto* result = new TFE_MonitoringBoolGauge0({name, description}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -393,7 +397,7 @@ TFE_MonitoringBoolGauge1* TFE_MonitoringNewBoolGauge1(const char* name, const char* description, const char* label1) { auto* result = new TFE_MonitoringBoolGauge1({name, description, label1}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -418,7 +422,7 @@ TFE_MonitoringBoolGauge2* TFE_MonitoringNewBoolGauge2(const char* name, const char* label2) { auto* result = new TFE_MonitoringBoolGauge2({name, description, label1, label2}); - Set_TF_Status_from_Status(status, result->gauge->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->gauge->GetStatus()); if (!result->gauge->GetStatus().ok()) { delete result; return nullptr; @@ -472,7 +476,7 @@ TFE_MonitoringSampler0* TFE_MonitoringNewSampler0( const char* description) { auto* result = new TFE_MonitoringSampler0( {name, buckets->create_buckets(), description}); - Set_TF_Status_from_Status(status, result->sampler->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->sampler->GetStatus()); if (!result->sampler->GetStatus().ok()) { delete result; return nullptr; @@ -495,7 +499,7 @@ TFE_MonitoringSampler1* TFE_MonitoringNewSampler1( const char* description, const char* label1) { auto* result = new TFE_MonitoringSampler1( {name, buckets->create_buckets(), description, label1}); - Set_TF_Status_from_Status(status, result->sampler->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->sampler->GetStatus()); if 
(!result->sampler->GetStatus().ok()) { delete result; return nullptr; @@ -518,7 +522,7 @@ TFE_MonitoringSampler2* TFE_MonitoringNewSampler2( const char* description, const char* label1, const char* label2) { auto* result = new TFE_MonitoringSampler2( {name, buckets->create_buckets(), description, label1, label2}); - Set_TF_Status_from_Status(status, result->sampler->GetStatus()); + tsl::Set_TF_Status_from_Status(status, result->sampler->GetStatus()); if (!result->sampler->GetStatus().ok()) { delete result; return nullptr; @@ -628,6 +632,30 @@ void TFE_ContextGetFunctionDef(TFE_Context* ctx, const char* function_name, status->status = ::tensorflow::OkStatus(); } +void TFE_ContextGetGraphDebugInfo(TFE_Context* ctx, const char* function_name, + TF_Buffer* buf, TF_Status* status) { + auto function_record = tensorflow::unwrap(ctx)->FindRecord(function_name); + if (function_record == nullptr) { + status->status = tensorflow::errors::NotFound( + "Unable to find function with name: ", function_name); + return; + } + + tensorflow::GraphDebugInfo debug_info = + tensorflow::StackTracesMapToGraphDebugInfo( + function_record->stack_traces()); + + string str = debug_info.SerializeAsString(); + void* data = tensorflow::port::Malloc(str.length()); + str.copy(static_cast(data), str.length(), 0); + buf->data = data; + buf->length = str.length(); + buf->data_deallocator = [](void* data, size_t length) { + tensorflow::port::Free(data); + }; + status->status = ::tensorflow::OkStatus(); +} + TF_Tensor* TFE_AllocateHostTensor(TFE_Context* ctx, TF_DataType dtype, const int64_t* dims, int num_dims, TF_Status* status) { @@ -884,7 +912,7 @@ void TFE_GetTaskStates(TFE_Context* ctx, const TF_Buffer& tasks, void* states, const auto& result = (*results)[i]; TF_Status s; TF_SetStatus(&s, static_cast(result.error_code()), - result.error_message().data()); + std::string(result.error_message()).data()); if (TF_GetCode(&s) != TF_Code::TF_OK) { tensorflow::CoordinationServiceError error; *error.mutable_source_task() = result.error_payload().source_task(); diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 48c5fe70ce0..fcbced2080a 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -612,6 +612,17 @@ TF_CAPI_EXPORT extern void TFE_ContextGetFunctionDef(TFE_Context* ctx, TF_Buffer* buf, TF_Status* status); +// Get GraphDebugInfo containing stack traces mapping to node names +TF_CAPI_EXPORT extern void TFE_ContextGetGraphDebugInfo( + TFE_Context* ctx, const char* function_name, TF_Buffer* buf, + TF_Status* status); + +// Extracts a TF_Function from the context. +// Must call TF_DeleteFunction on the returned value. +TF_CAPI_EXPORT extern TF_Function* TFE_ContextGetFunction(TFE_Context* ctx, + const char* name, + TF_Status* status); + // Allocate and return a new Tensor on the host. // // The caller must set the Tensor values by writing them to the pointer returned diff --git a/tensorflow/c/eager/c_api_experimental_reader.cc b/tensorflow/c/eager/c_api_experimental_reader.cc new file mode 100644 index 00000000000..0959580a104 --- /dev/null +++ b/tensorflow/c/eager/c_api_experimental_reader.cc @@ -0,0 +1,42 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License");; +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/c_api_experimental_reader.h" + +#include "tensorflow/c/eager/tfe_monitoring_reader_internal.h" + +template +int64_t TFE_MonitoringCounterReader::Read(const LabelType&... labels) { + return counter->Read(labels...); +} + +TFE_MonitoringCounterReader* TFE_MonitoringNewCounterReader(const char* name) { + auto* result = new TFE_MonitoringCounterReader(name); + + return result; +} + +int64_t TFE_MonitoringReadCounter0(TFE_MonitoringCounterReader* cell_reader) { + int64_t result = cell_reader->Read(); + + return result; +} + +int64_t TFE_MonitoringReadCounter1(TFE_MonitoringCounterReader* cell_reader, + const char* label) { + int64_t result = cell_reader->Read(label); + + return result; +} diff --git a/tensorflow/c/eager/c_api_experimental_reader.h b/tensorflow/c/eager/c_api_experimental_reader.h new file mode 100644 index 00000000000..71c2e4650f0 --- /dev/null +++ b/tensorflow/c/eager/c_api_experimental_reader.h @@ -0,0 +1,60 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License");; +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_READER_H_ +#define TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_READER_H_ + +#include "tensorflow/c/eager/c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Test only exports of the monitoring Cell Reader API which allows tests to +// read current values from streamz counters defined in other modules. +// +// The code under test will have created streamz counters like this: +// auto* streamz = tensorflow::monitoring::Counter<1>::New("name", +// "description", "label"); +// and then incremented that counter for various values of label: +// streamz->GetCell("label-value")->IncrementBy(1); +// +// The test code can then read and test the value of that counter: +// +// auto* reader = TFE_MonitoringNewCounterReader("name"); +// test(); +// int64_t value = TFE_MonitoringReadCounter1(reader, "label-value"); + +// Opaque handle to a reader. +typedef struct TFE_MonitoringCounterReader TFE_MonitoringCounterReader; + +// Returns a handle to be used for reading values from streamz counter. The +// counter can have been created with any number of labels. +TF_CAPI_EXPORT extern TFE_MonitoringCounterReader* +TFE_MonitoringNewCounterReader(const char* name); + +// Reads the value of a counter that was created with 0 labels. 
+TF_CAPI_EXPORT extern int64_t TFE_MonitoringReadCounter0( + TFE_MonitoringCounterReader*); + +// Reads the value of specific cell of a counter that was created with 1 label. +TF_CAPI_EXPORT extern int64_t TFE_MonitoringReadCounter1( + TFE_MonitoringCounterReader*, const char* label_value); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_READER_H_ diff --git a/tensorflow/c/eager/c_api_experimental_reader_test.cc b/tensorflow/c/eager/c_api_experimental_reader_test.cc new file mode 100644 index 00000000000..3c7a09891a6 --- /dev/null +++ b/tensorflow/c/eager/c_api_experimental_reader_test.cc @@ -0,0 +1,86 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/c_api_experimental_reader.h" + +#include + +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +TFE_MonitoringCounter0* CreateCounter0(const char* counter_name); +TFE_MonitoringCounter1* CreateCounter1(const char* counter_name, + const char* label); +void IncrementCounter0(TFE_MonitoringCounter0* counter, int64_t delta = 1); +void IncrementCounter1(TFE_MonitoringCounter1* counter, const char* label, + int64_t delta = 1); + +TEST(CAPI, MonitoringCellReader0) { + auto counter_name = "test/counter0"; + auto* counter = CreateCounter0(counter_name); + auto* reader = TFE_MonitoringNewCounterReader(counter_name); + IncrementCounter0(counter); + + int64_t actual = TFE_MonitoringReadCounter0(reader); + + CHECK_EQ(actual, 1); +} + +TEST(CAPI, MonitoringCellReader1) { + auto counter_name = "test/counter1"; + auto label_name = "test/label"; + auto* counter = CreateCounter1(counter_name, label_name); + auto* reader = TFE_MonitoringNewCounterReader(counter_name); + IncrementCounter1(counter, label_name); + + int64_t actual = TFE_MonitoringReadCounter1(reader, label_name); + + CHECK_EQ(actual, 1); +} + +TFE_MonitoringCounter0* CreateCounter0(const char* counter_name) { + TF_Status* status = TF_NewStatus(); + auto* counter = + TFE_MonitoringNewCounter0(counter_name, status, "description"); + TF_DeleteStatus(status); + return counter; +} + +void IncrementCounter0(TFE_MonitoringCounter0* counter, int64_t delta) { + auto* cell = TFE_MonitoringGetCellCounter0(counter); + TFE_MonitoringCounterCellIncrementBy(cell, delta); +} + +TFE_MonitoringCounter1* CreateCounter1(const char* counter_name, + const char* label) { + TF_Status* status = TF_NewStatus(); + auto* counter = + TFE_MonitoringNewCounter1(counter_name, status, "description", label); + TF_DeleteStatus(status); + return counter; +} + +void IncrementCounter1(TFE_MonitoringCounter1* counter, const char* label, + int64_t delta) { + auto* cell = TFE_MonitoringGetCellCounter1(counter, label); + TFE_MonitoringCounterCellIncrementBy(cell, delta); +} + +} // namespace +} // 
namespace tensorflow diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index 8f600c5de8f..1fb76748059 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/c/eager/c_api_test_util.h" +#include +#include +#include + #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/tf_datatype.h" @@ -434,6 +438,8 @@ tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { int port = tensorflow::testing::PickUnusedPortOrDie(); job_def->mutable_tasks()->insert( {i, tensorflow::strings::StrCat("localhost:", port)}); + LOG(INFO) << "Picked test port: " << port << " for job: " << job_name + << ", task: " << i; } return server_def; } diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index 543976b4a6b..53f340ee2aa 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -76,7 +76,7 @@ static TracingContext* CreateTracingExecutionContext(const char* fn_name, if (default_factory) { return default_factory(fn_name, s); } - Set_TF_Status_from_Status( + tsl::Set_TF_Status_from_Status( s, errors::FailedPrecondition("default_factory is nullptr")); return nullptr; } @@ -109,7 +109,7 @@ using tensorflow::tracing::TracingOperation; using tensorflow::tracing::TracingTensorHandle; void TF_SetTracingImplementation(const char* name, TF_Status* s) { - Set_TF_Status_from_Status(s, SetDefaultTracingEngine(name)); + tsl::Set_TF_Status_from_Status(s, SetDefaultTracingEngine(name)); } // Creates a new TensorFlow function, it is an execution context attached to a @@ -123,12 +123,13 @@ TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, AbstractFunction* func; TracingContext* tracing_ctx = dyn_cast(unwrap(ctx)); if (!tracing_ctx) { - Set_TF_Status_from_Status( + tsl::Set_TF_Status_from_Status( s, tensorflow::errors::InvalidArgument( "Only TracingContext can be converted into a function.")); return nullptr; } - Set_TF_Status_from_Status(s, tracing_ctx->Finalize(unwrap(outputs), &func)); + tsl::Set_TF_Status_from_Status(s, + tracing_ctx->Finalize(unwrap(outputs), &func)); TF_DeleteExecutionContext(ctx); return wrap(func); } @@ -140,7 +141,7 @@ TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, TracingTensorHandle* t; TracingContext* tracing_ctx = dyn_cast(unwrap(func)); if (!tracing_ctx) { - Set_TF_Status_from_Status( + tsl::Set_TF_Status_from_Status( s, tensorflow::errors::InvalidArgument( "TF_AddFunctionParameter must be called on a TracingContext.")); return nullptr; @@ -152,11 +153,11 @@ TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, reinterpret_cast(shape.dim_sizes), shape.num_dims, &partial_shape); if (!status.ok()) { - Set_TF_Status_from_Status(s, status); + tsl::Set_TF_Status_from_Status(s, status); return nullptr; } } - Set_TF_Status_from_Status( + tsl::Set_TF_Status_from_Status( s, tracing_ctx->AddParameter(static_cast(dtype), partial_shape, &t)); return wrap(t); @@ -193,20 +194,21 @@ void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, void TF_AbstractOpSetOpType(TF_AbstractOp* op, const char* const op_type, TF_Status* s) { - Set_TF_Status_from_Status(s, unwrap(op)->Reset(op_type, - /*raw_device_name=*/nullptr)); + tsl::Set_TF_Status_from_Status( + s, unwrap(op)->Reset(op_type, + 
/*raw_device_name=*/nullptr)); } void TF_AbstractOpSetOpName(TF_AbstractOp* op, const char* const op_name, TF_Status* s) { TracingOperation* tracing_op = dyn_cast(unwrap(op)); if (!tracing_op) { - Set_TF_Status_from_Status( + tsl::Set_TF_Status_from_Status( s, tensorflow::errors::InvalidArgument( "TF_AbstractOpSetOpName must be called on a TracingOperation.")); return; } - Set_TF_Status_from_Status(s, tracing_op->SetOpName(op_name)); + tsl::Set_TF_Status_from_Status(s, tracing_op->SetOpName(op_name)); } void TF_AbstractOpSetAttrType(TF_AbstractOp* op, const char* const attr_name, @@ -214,20 +216,20 @@ void TF_AbstractOpSetAttrType(TF_AbstractOp* op, const char* const attr_name, Status status = unwrap(op)->SetAttrType(attr_name, static_cast(value)); TF_SetStatus(s, static_cast(status.code()), - status.error_message().c_str()); + tsl::NullTerminatedMessage(status)); } void TF_ExecuteOperation(TF_AbstractOp* op, int num_inputs, TF_AbstractTensor* const* inputs, TF_OutputList* o, TF_Status* s) { for (int i = 0; i < num_inputs; i++) { - Set_TF_Status_from_Status(s, unwrap(op)->AddInput(unwrap(inputs[i]))); + tsl::Set_TF_Status_from_Status(s, unwrap(op)->AddInput(unwrap(inputs[i]))); if (TF_GetCode(s) != TF_OK) { return; } } int num_outputs = unwrap(o)->expected_num_outputs; - Set_TF_Status_from_Status( + tsl::Set_TF_Status_from_Status( s, unwrap(op)->Execute( absl::MakeSpan(reinterpret_cast( unwrap(o)->outputs.data()), @@ -242,5 +244,6 @@ void TF_DeleteAbstractFunction(TF_AbstractFunction* func) { void TF_ExecutionContextRegisterFunction(TF_ExecutionContext* ctx, TF_AbstractFunction* func, TF_Status* s) { - Set_TF_Status_from_Status(s, unwrap(ctx)->RegisterFunction(unwrap(func))); + tsl::Set_TF_Status_from_Status(s, + unwrap(ctx)->RegisterFunction(unwrap(func))); } diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index 7d36cb0ad12..af8797c2932 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include +#include #include #include "absl/strings/str_cat.h" @@ -204,7 +205,7 @@ class GraphOperation : public TracingOperation { Status SetAttrType(const char* const attr_name, DataType value) override { if (!op_) { return Status( - error::Code::FAILED_PRECONDITION, + absl::StatusCode::kFailedPrecondition, "op_type and op_name must be specified before specifying attrs."); } op_->node_builder.Attr(attr_name, value); @@ -387,7 +388,7 @@ class GraphContext : public TracingContext { inputs_.size(), inputs_.data(), graph_outputs.size(), graph_outputs.data(), nullptr, nullptr, name_.data(), s); - *f = new GraphFunction(std::move(func->fdef)); + *f = new GraphFunction(std::move(func->record->fdef())); TF_DeleteFunction(func); TF_RETURN_IF_ERROR(StatusFromTF_Status(s)); TF_DeleteStatus(s); diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index 4814344e405..edaf3d8e579 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -47,7 +47,7 @@ class UnifiedCAPI TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); Status s = StatusFromTF_Status(status.get()); - CHECK_EQ(errors::OK, s.code()) << s.error_message(); + CHECK_EQ(errors::OK, s.code()) << s.message(); } }; diff --git a/tensorflow/c/eager/gradient_checker_test.cc b/tensorflow/c/eager/gradient_checker_test.cc index 4a688cec241..e012b29e93f 100644 --- a/tensorflow/c/eager/gradient_checker_test.cc +++ b/tensorflow/c/eager/gradient_checker_test.cc @@ -41,13 +41,13 @@ void CompareNumericalAndManualGradients( AbstractTensorHandle* numerical_grad_raw; s = CalcNumericalGrad(ctx, model, inputs, input_index, use_function, &numerical_grad_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); numerical_grad.reset(numerical_grad_raw); } TF_Tensor* numerical_tensor; s = GetValue(numerical_grad.get(), &numerical_tensor); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); auto num_elem_numerical = TF_TensorElementCount(numerical_tensor); ASSERT_EQ(num_elem_numerical, num_grad); @@ -90,14 +90,14 @@ class GradientCheckerTest { Status s = StatusFromTF_Status(status.get()); - CHECK_EQ(errors::OK, s.code()) << s.error_message(); + CHECK_EQ(errors::OK, s.code()) << s.message(); } { AbstractContext* ctx_raw = nullptr; Status s = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); ctx_.reset(ctx_raw); } @@ -122,7 +122,7 @@ TEST_P(GradientCheckerTest, TestMatMul) { AbstractTensorHandle* A_raw; Status s = TestTensorHandleWithDims(ctx_.get(), A_vals, A_dims, 2, &A_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); A.reset(A_raw); } float B_vals[] = {.5f, -1.0f, 1.0f, 1.0f}; @@ -132,7 +132,7 @@ TEST_P(GradientCheckerTest, TestMatMul) { AbstractTensorHandle* B_raw; Status s = TestTensorHandleWithDims(ctx_.get(), B_vals, B_dims, 2, &B_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); B.reset(B_raw); } @@ -148,7 +148,7 @@ TEST_P(GradientCheckerTest, TestMul) { AbstractTensorHandle* x_raw = nullptr; Status s = TestScalarTensorHandle(ctx_.get(), 
2.0f, &x_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); x.reset(x_raw); } @@ -157,7 +157,7 @@ TEST_P(GradientCheckerTest, TestMul) { AbstractTensorHandle* y_raw = nullptr; Status s = TestScalarTensorHandle(ctx_.get(), 7.0f, &y_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); y.reset(y_raw); } diff --git a/tensorflow/c/eager/gradients_test.cc b/tensorflow/c/eager/gradients_test.cc index a24e97f9981..a345240e8c3 100644 --- a/tensorflow/c/eager/gradients_test.cc +++ b/tensorflow/c/eager/gradients_test.cc @@ -53,7 +53,7 @@ class CppGradients TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); Status s = StatusFromTF_Status(status.get()); - CHECK_EQ(errors::OK, s.code()) << s.error_message(); + CHECK_EQ(errors::OK, s.code()) << s.message(); } }; @@ -70,7 +70,7 @@ TEST_P(CppGradients, TestSetAttrString) { AbstractContext* ctx_raw = nullptr; Status s = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); ctx.reset(ctx_raw); } @@ -78,7 +78,7 @@ TEST_P(CppGradients, TestSetAttrString) { { AbstractTensorHandle* x_raw = nullptr; Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); t.reset(x_raw); } @@ -86,31 +86,31 @@ TEST_P(CppGradients, TestSetAttrString) { ForwardOperation forward_op; Status s = Reset(check_numerics_op.get(), "CheckNumerics", /*raw_device_name=*/nullptr, &forward_op); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); if (isa(check_numerics_op.get())) { s = dyn_cast(check_numerics_op.get()) ->SetOpName("check_numerics"); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); } s = AddInput(check_numerics_op.get(), t.get(), &forward_op); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); string message = "This is the way!"; s = SetAttrString(check_numerics_op.get(), "message", message.data(), message.length(), &forward_op); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); int num_retvals = 1; std::vector outputs(1); GradientRegistry registry; s = RegisterGradients(®istry); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); auto tape = std::make_unique(/*persistent=*/false); s = Execute(check_numerics_op.get(), ctx.get(), absl::MakeSpan(outputs), &num_retvals, &forward_op, tape.get(), registry); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); string read_message; s = forward_op.attrs.Get("message", &read_message); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); ASSERT_EQ(read_message, message); } @@ -136,7 +136,7 @@ TEST_P(CppGradients, TestRecordOperationWithNullGradientFunctionRaises) { AbstractContext* ctx_raw = nullptr; Status s = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); ctx.reset(ctx_raw); } @@ -144,7 +144,7 @@ TEST_P(CppGradients, 
TestRecordOperationWithNullGradientFunctionRaises) { { AbstractTensorHandle* x_raw = nullptr; Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); x.reset(x_raw); } @@ -157,7 +157,7 @@ TEST_P(CppGradients, TestRecordOperationWithNullGradientFunctionRaises) { "Provided null gradient_function for 'Neg'.\nIf the intent is to treat " "this op as non-differentiable consider using RegisterNotDifferentiable " "or NotDifferentiableGradientFunction.", - s.error_message()); + s.message()); ASSERT_EQ(nullptr, outputs[0]); } diff --git a/tensorflow/c/eager/immediate_execution_context.h b/tensorflow/c/eager/immediate_execution_context.h index 930a26bb120..f4eb7a05367 100644 --- a/tensorflow/c/eager/immediate_execution_context.h +++ b/tensorflow/c/eager/immediate_execution_context.h @@ -134,6 +134,10 @@ class ImmediateExecutionContext : public AbstractContext { // Find and return a added function by its name. virtual const FunctionDef* FindFunctionDef(const string& name) const = 0; + // Find and return a function record added by its name. + virtual core::RefCountPtr FindRecord( + const string& name) const = 0; + // Return the ParsedName of Host CPU device. virtual const DeviceNameUtils::ParsedName& HostCPUParsedName() const = 0; virtual const string& HostCPUName() const = 0; @@ -249,6 +253,7 @@ class ImmediateExecutionContext : public AbstractContext { int64_t kernel_cache_size; int64_t device_cache_size; std::map func_kernel_cache_entries; + int64_t local_rendezvous_cache_active_size; }; virtual CacheStats GetCacheStats() = 0; diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index 0de029ff449..6195d5d1e85 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -77,6 +77,7 @@ cc_library( visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/c:c_api", + "//tensorflow/c:safe_ptr", "//tensorflow/c:tf_status_internal", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:c_api_experimental", diff --git a/tensorflow/c/eager/parallel_device/parallel_device.cc b/tensorflow/c/eager/parallel_device/parallel_device.cc index 71a5c46b7ea..8c51559d3f7 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device.cc @@ -211,7 +211,7 @@ int ParallelTensorNumDims(void* data, TF_Status* status) { const std::vector* shape; Status s = reinterpret_cast(data)->Shape(&shape); if (!s.ok()) { - Set_TF_Status_from_Status(status, s); + tsl::Set_TF_Status_from_Status(status, s); return -1; } return shape->size(); @@ -223,7 +223,7 @@ int64_t ParallelTensorDim(void* data, int dim_index, TF_Status* status) { const std::vector* shape; Status s = reinterpret_cast(data)->Shape(&shape); if (!s.ok()) { - Set_TF_Status_from_Status(status, s); + tsl::Set_TF_Status_from_Status(status, s); return -1; } return (*shape)[dim_index]; @@ -234,7 +234,7 @@ TF_Buffer* ParallelTensorSummarize(void* data, TF_Status* status) { std::string summary; Status cpp_status = parallel_tensor->SummarizeValue(summary); if (!cpp_status.ok()) { - Set_TF_Status_from_Status(status, cpp_status); + tsl::Set_TF_Status_from_Status(status, cpp_status); return nullptr; } return TF_NewBufferFromString(summary.data(), summary.size()); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc index d3ff26b0a74..0522ad3b730 100644 
--- a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc @@ -368,6 +368,27 @@ void ParallelDevice::StartExecute(TFE_Context* context, } } +void ParallelDevice::StartExecute( + TFE_Context* context, + const std::vector>& inputs, + const char* operation_name, const TFE_OpAttrs* attributes, + int expected_max_outputs, CancellationManager& cancellation_manager, + absl::optional step_id) const { + for (int device_index = 0; device_index < underlying_devices_.size(); + ++device_index) { + DeviceThread* device_thread = device_threads_[device_index].get(); + std::vector device_inputs; + device_inputs.reserve(inputs.size()); + for (int input_index = 0; input_index < inputs.size(); ++input_index) { + // Parallel tensors are divided between operations by device. + device_inputs.push_back(inputs[input_index][device_index]); + } + device_thread->StartExecute( + context, operation_name, std::move(device_inputs), attributes, + expected_max_outputs, cancellation_manager, step_id); + } +} + void ParallelDevice::AsyncWait(TFE_Context* context, TF_Status* status) const { StatusPtr first_bad_status(nullptr); @@ -486,6 +507,11 @@ std::unique_ptr ParallelTensor::FromTensorHandles( const ParallelDevice& parallel_device, std::vector components, absl::Span shape, TF_Status* status) { + if (components.empty()) { + TF_SetStatus(status, TF_INTERNAL, + "No components are provide for creating a ParallelTensor"); + return nullptr; + } TFE_TensorHandleGetStatus(components[0].get(), status); if (!status->status.ok()) { return nullptr; @@ -513,6 +539,11 @@ std::unique_ptr ParallelTensor::FromTensorHandles( std::unique_ptr ParallelTensor::FromTensorHandles( const ParallelDevice& parallel_device, std::vector components, TF_Status* status) { + if (components.empty()) { + TF_SetStatus(status, TF_INTERNAL, + "No components are provided for creating a ParallelTensor"); + return nullptr; + } TFE_TensorHandleGetStatus(components[0].get(), status); if (!status->status.ok()) { return nullptr; diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.h b/tensorflow/c/eager/parallel_device/parallel_device_lib.h index 4b87ad4c106..b1b96d3b410 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.h +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.h @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include #include "absl/types/optional.h" @@ -28,6 +29,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/safe_ptr.h" #include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" @@ -35,19 +37,7 @@ limitations under the License. namespace tensorflow { namespace parallel_device { -// Functor for making unique_ptrs slightly more ergonomic. Using -// decltype(delete_fn) in the unique_ptr's second template argument requires -// passing a function pointer to delete_fn when constructing the unique_ptr. -class TensorHandleDeleter { - public: - void operator()(TFE_TensorHandle* to_delete) const { - TFE_DeleteTensorHandle(to_delete); - } -}; - -// TODO(b/256016071): Replace this with `Safe_TFE_TensorHandlePtr` when -// `Safe_TFE_TensorHandlePtr` is marked to be compatible on non-prod env. 
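The new StartExecute overload takes its inputs in input-major order: inputs[i][d] is input i's component on device d, and each per-device column is handed to the matching device thread. A minimal caller-side sketch, assuming the handle, attribute, and ParallelDevice variables already exist and using an illustrative op name:

  // Two inputs, each pre-split across the two underlying devices.
  std::vector<std::vector<TFE_TensorHandle*>> inputs = {
      {x_on_dev0, x_on_dev1},  // input 0: one component per device
      {y_on_dev0, y_on_dev1},  // input 1
  };
  tensorflow::CancellationManager cancellation_manager;
  parallel_device.StartExecute(context, inputs, "AddV2", attributes,
                               /*expected_max_outputs=*/1,
                               cancellation_manager);
  // A subsequent Join (not shown here) collects one ParallelTensor per output.

The empty-components checks added to both ParallelTensor::FromTensorHandles overloads cover the reverse direction: wrapping per-device handles into a ParallelTensor now reports TF_INTERNAL for an empty vector instead of dereferencing components[0].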
-using TensorHandlePtr = std::unique_ptr; +using TensorHandlePtr = tensorflow::Safe_TFE_TensorHandlePtr; class ParallelTensor; class DeviceThread; @@ -128,6 +118,13 @@ class ParallelDevice { CancellationManager& cancellation_manager, std::optional step_id = std::nullopt) const; + void StartExecute(TFE_Context* context, + const std::vector>& inputs, + const char* operation_name, const TFE_OpAttrs* attributes, + int expected_max_outputs, + CancellationManager& cancellation_manager, + std::optional step_id = std::nullopt) const; + // Blocks until the previous `StartExecute` has run `TFE_Execute` on each // device. If is_async=false (constructor argument) this means the ops have // run and have results. If is_async=true it means that all of the @@ -206,6 +203,17 @@ class ParallelTensor { // component device. Status SummarizeValue(std::string& summary); + std::vector release_tensors() { return std::move(tensors_); } + + std::vector tensors() const { + std::vector result; + result.reserve(tensors_.size()); + for (const TensorHandlePtr& tensor : tensors_) { + result.emplace_back(tensor.get()); + } + return result; + } + private: ParallelTensor(const ParallelDevice& device, std::vector tensors, @@ -222,7 +230,7 @@ class ParallelTensor { dtype_(dtype) {} const ParallelDevice& device_; - const std::vector tensors_; + std::vector tensors_; // Parallel tensors are immutable but compute their shape lazily unless it is // provided on construction. The optional has a value if the lazy computation // has been completed or the shape was provided on construction. diff --git a/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc index 41d6f14e068..9f157ae760e 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include #include #include "tensorflow/c/c_api.h" @@ -37,6 +38,8 @@ tensorflow::ServerDef GetServerDef(const std::string& job_name, int num_tasks) { int port = tensorflow::testing::PickUnusedPortOrDie(); job_def->mutable_tasks()->insert( {i, tensorflow::strings::StrCat("localhost", ":", port)}); + LOG(INFO) << "Picked test port: " << port << " for job: " << job_name + << ", task: " << i; } return server_def; } diff --git a/tensorflow/c/eager/tfe_monitoring_reader_internal.h b/tensorflow/c/eager/tfe_monitoring_reader_internal.h new file mode 100644 index 00000000000..3c63e6725f1 --- /dev/null +++ b/tensorflow/c/eager/tfe_monitoring_reader_internal.h @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_TFE_MONITORING_READER_INTERNAL_H_ +#define TENSORFLOW_C_EAGER_TFE_MONITORING_READER_INTERNAL_H_ + +#include + +#include "tensorflow/core/lib/monitoring/cell_reader.h" + +struct TFE_MonitoringCounterReader { + explicit TFE_MonitoringCounterReader(const char* name) { + counter = std::make_unique< + ::tensorflow::monitoring::testing::CellReader>(name); + } + template + int64_t Read(const LabelType&... labels); + std::unique_ptr<::tensorflow::monitoring::testing::CellReader> + counter; +}; + +#endif // TENSORFLOW_C_EAGER_TFE_MONITORING_READER_INTERNAL_H_ diff --git a/tensorflow/c/eager/unified_api_test.cc b/tensorflow/c/eager/unified_api_test.cc index a9204f4462c..27e42be5bcc 100644 --- a/tensorflow/c/eager/unified_api_test.cc +++ b/tensorflow/c/eager/unified_api_test.cc @@ -30,7 +30,7 @@ class UnifiedAPI TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); Status s = StatusFromTF_Status(status.get()); - CHECK_EQ(errors::OK, s.code()) << s.error_message(); + CHECK_EQ(errors::OK, s.code()) << s.message(); } public: @@ -61,7 +61,7 @@ TEST_P(UnifiedAPI, TestTensorShapeScalar) { AbstractContext* ctx_raw = nullptr; Status s = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); ctx.reset(ctx_raw); } @@ -69,7 +69,7 @@ TEST_P(UnifiedAPI, TestTensorShapeScalar) { { AbstractTensorHandle* x_raw = nullptr; Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); x.reset(x_raw); } @@ -77,7 +77,7 @@ TEST_P(UnifiedAPI, TestTensorShapeScalar) { /*inputs=*/{x.get()}, /*outputs=*/{}, /*use_function=*/UseFunction()); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); } // Checks that inputs[0] is a matrix with shape 2x4. 
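The error_message() to message() replacements running through these test files track the Status API change: message() returns a string piece rather than a std::string reference, so streaming it into LOG, CHECK, or ASSERT remains a one-token change, while call sites that hand the text to a C API expecting a NUL-terminated char* switch to tsl::NullTerminatedMessage instead (as in the TF_AbstractOpSetAttrType hunk earlier). A small sketch of the two patterns, with the status-producing call and the TF_Status* variable assumed:

  Status s = SomeTensorFlowCall();  // hypothetical call returning a Status
  if (!s.ok()) {
    LOG(ERROR) << "call failed: " << s.message();  // streaming: message() suffices
  }
  TF_SetStatus(c_status, static_cast<TF_Code>(s.code()),
               tsl::NullTerminatedMessage(s));  // C string needed: use the helper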
@@ -111,7 +111,7 @@ TEST_P(UnifiedAPI, TestTensorShape2x4) { AbstractContext* ctx_raw = nullptr; Status s = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); ctx.reset(ctx_raw); } @@ -122,7 +122,7 @@ TEST_P(UnifiedAPI, TestTensorShape2x4) { int64_t dim_sizes[] = {2, 4}; Status s = TestTensorHandleWithDims(ctx.get(), data, dim_sizes, 2, &x_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); x.reset(x_raw); } @@ -130,7 +130,7 @@ TEST_P(UnifiedAPI, TestTensorShape2x4) { /*inputs=*/{x.get()}, /*outputs=*/{}, /*use_function=*/UseFunction()); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); } TEST_P(UnifiedAPI, TestUnknownShapeTracing) { @@ -148,13 +148,13 @@ TEST_P(UnifiedAPI, TestUnknownShapeTracing) { PartialTensorShape shape; Status s = dyn_cast(ctx.get())->AddParameter( DT_FLOAT, shape, &x_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); x.reset(x_raw); } PartialTensorShape shape; Status s = x->Shape(&shape); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); ASSERT_TRUE(shape.unknown_rank()); } @@ -172,16 +172,16 @@ TEST_P(UnifiedAPI, TestPartialShapeTracing) { PartialTensorShape shape; int64_t dim_sizes[] = {2, -1}; Status s = PartialTensorShape::MakePartialShape(dim_sizes, 2, &shape); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); s = dyn_cast(ctx.get())->AddParameter( DT_FLOAT, shape, &x_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); x.reset(x_raw); } PartialTensorShape shape; Status s = x->Shape(&shape); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); ASSERT_FALSE(shape.unknown_rank()); ASSERT_EQ(2, shape.dim_size(0)); diff --git a/tensorflow/c/experimental/gradients/BUILD b/tensorflow/c/experimental/gradients/BUILD index 1788cbd6551..65f580deee9 100644 --- a/tensorflow/c/experimental/gradients/BUILD +++ b/tensorflow/c/experimental/gradients/BUILD @@ -178,9 +178,9 @@ tf_cuda_cc_test( "//tensorflow/c/eager:unified_api_testutil", "//tensorflow/c/experimental/gradients/tape:tape_context", "//tensorflow/c/experimental/ops:nn_ops", - "//tensorflow/core/platform:tensor_float_32_utils", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/platform:tensor_float_32_utils", ] + if_libtpu( if_false = ["//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration"], if_true = [], @@ -204,9 +204,9 @@ tf_cuda_cc_test( "//tensorflow/c/eager:unified_api_testutil", "//tensorflow/c/experimental/gradients/tape:tape_context", "//tensorflow/c/experimental/ops:math_ops", - "//tensorflow/core/platform:tensor_float_32_utils", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/platform:tensor_float_32_utils", ] + if_libtpu( if_false = ["//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration"], if_true = [], @@ -222,17 +222,17 @@ tf_cuda_cc_test( args = ["--heap_check="], # TODO(b/174752220): Remove tags = tf_cuda_tests_tags() + ["no_cuda_asan"], # b/173654156, deps = [ - ":grad_test_helper", ":array_grad", + ":grad_test_helper", "//tensorflow/c:tf_status_helper", "//tensorflow/c/eager:c_api_test_util", - 
"//tensorflow/c/experimental/gradients/tape:tape_context", - "//tensorflow/c/experimental/ops:array_ops", - "//tensorflow/core/platform:tensor_float_32_utils", - "//tensorflow/core:test", - "//tensorflow/core:test_main", "//tensorflow/c/eager:c_api_unified_internal", "//tensorflow/c/eager:unified_api_testutil", + "//tensorflow/c/experimental/gradients/tape:tape_context", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/platform:tensor_float_32_utils", ] + if_libtpu( if_false = ["//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration"], if_true = [], diff --git a/tensorflow/c/experimental/gradients/array_grad_test.cc b/tensorflow/c/experimental/gradients/array_grad_test.cc index 61c0bce6664..fcaafd693e1 100644 --- a/tensorflow/c/experimental/gradients/array_grad_test.cc +++ b/tensorflow/c/experimental/gradients/array_grad_test.cc @@ -51,13 +51,13 @@ class CppGradients TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); status_ = StatusFromTF_Status(status.get()); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); { AbstractContext* ctx_raw = nullptr; status_ = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); immediate_execution_ctx_.reset(ctx_raw); } @@ -86,7 +86,7 @@ TEST_P(CppGradients, TestIdentityNGrad) { AbstractTensorHandle* x1_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 1.0f, &x1_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x1.reset(x1_raw); } @@ -95,19 +95,19 @@ TEST_P(CppGradients, TestIdentityNGrad) { AbstractTensorHandle* x2_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 1.0f, &x2_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x2.reset(x2_raw); } status_ = registry_.Register("IdentityN", IdentityNRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); auto IdentityNGradModel = BuildGradModel(IdentityNModel, registry_); std::vector outputs(2); status_ = RunModel(IdentityNGradModel, immediate_execution_ctx_.get(), {x1.get(), x2.get()}, absl::MakeSpan(outputs), UseFunction()); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); EXPECT_EQ(outputs[0], nullptr); ASSERT_NO_FATAL_FAILURE(CheckTensorValue(outputs[1], {1.0f}, /*dims*/ {}, /*abs_error*/ 0)); diff --git a/tensorflow/c/experimental/gradients/custom_gradient_test.cc b/tensorflow/c/experimental/gradients/custom_gradient_test.cc index d447073b36a..cce9a051a74 100644 --- a/tensorflow/c/experimental/gradients/custom_gradient_test.cc +++ b/tensorflow/c/experimental/gradients/custom_gradient_test.cc @@ -38,7 +38,7 @@ class CustomGradientTest TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); Status s = StatusFromTF_Status(status.get()); - CHECK_EQ(errors::OK, s.code()) << s.error_message(); + CHECK_EQ(errors::OK, s.code()) << s.message(); } }; @@ -92,7 +92,7 @@ TEST_P(CustomGradientTest, 
ExpWithPassThroughGrad) { AbstractContext* ctx_raw = nullptr; Status s = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); ctx.reset(ctx_raw); } @@ -100,7 +100,7 @@ TEST_P(CustomGradientTest, ExpWithPassThroughGrad) { { AbstractTensorHandle* x_raw = nullptr; Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); x.reset(x_raw); } @@ -113,11 +113,11 @@ TEST_P(CustomGradientTest, ExpWithPassThroughGrad) { Status s = RunModel(ExpWithPassThroughGrad, ctx.get(), {x.get()}, absl::MakeSpan(outputs), /*use_function=*/!std::get<2>(GetParam())); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); TF_Tensor* result_tensor; s = GetValue(outputs[0], &result_tensor); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); auto result_value = static_cast(TF_TensorData(result_tensor)); EXPECT_EQ(*result_value, 1.0); outputs[0]->Unref(); diff --git a/tensorflow/c/experimental/gradients/grad_test_helper.cc b/tensorflow/c/experimental/gradients/grad_test_helper.cc index 1bcb72175f7..a4b71ea6d3b 100644 --- a/tensorflow/c/experimental/gradients/grad_test_helper.cc +++ b/tensorflow/c/experimental/gradients/grad_test_helper.cc @@ -30,7 +30,7 @@ void CompareNumericalAndAutodiffGradients( std::vector outputs(num_inputs); auto s = RunModel(grad_model, ctx, inputs, absl::MakeSpan(outputs), /*use_function=*/use_function); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); for (int i = 0; i < num_inputs; ++i) { if (!outputs[i]) continue; @@ -41,18 +41,18 @@ void CompareNumericalAndAutodiffGradients( s = CalcNumericalGrad(ctx, model, inputs, /*input_index=*/i, use_function, &numerical_grad_raw); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); numerical_grad.reset(numerical_grad_raw); } TF_Tensor* numerical_tensor; s = GetValue(numerical_grad.get(), &numerical_tensor); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); auto num_elem_numerical = TF_TensorElementCount(numerical_tensor); TF_Tensor* analytical_tensor; s = GetValue(outputs[i], &analytical_tensor); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); auto num_elem_analytical = TF_TensorElementCount(analytical_tensor); ASSERT_EQ(num_elem_numerical, num_elem_analytical); @@ -79,7 +79,7 @@ void CheckTensorValue(AbstractTensorHandle* t, absl::Span manuals, absl::Span dims, double abs_error) { TF_Tensor* analytical_tensor; auto s = GetValue(t, &analytical_tensor); - ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(errors::OK, s.code()) << s.message(); int64_t num_elem_analytical = 1; auto num_dims_analytical = TF_NumDims(analytical_tensor); diff --git a/tensorflow/c/experimental/gradients/math_grad_test.cc b/tensorflow/c/experimental/gradients/math_grad_test.cc index c528fc1ae40..d0d08db8fd4 100644 --- a/tensorflow/c/experimental/gradients/math_grad_test.cc +++ b/tensorflow/c/experimental/gradients/math_grad_test.cc @@ -86,13 +86,13 @@ class CppGradients TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); status_ = 
StatusFromTF_Status(status.get()); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); { AbstractContext* ctx_raw = nullptr; status_ = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); immediate_execution_ctx_.reset(ctx_raw); } @@ -117,7 +117,7 @@ TEST_P(CppGradients, TestAddGrad) { AbstractTensorHandle* x_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x.reset(x_raw); } @@ -126,14 +126,14 @@ TEST_P(CppGradients, TestAddGrad) { AbstractTensorHandle* y_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &y_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); y.reset(y_raw); } // TODO(srbs): Rename ops::Add to ops::AddV2 and AddRegister to // AddV2Registerer. status_ = registry_.Register("AddV2", AddRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CompareNumericalAndAutodiffGradients( AddModel, BuildGradModel(AddModel, registry_), @@ -146,12 +146,12 @@ TEST_P(CppGradients, TestExpGrad) { AbstractTensorHandle* x_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x.reset(x_raw); } status_ = registry_.Register("Exp", ExpRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CompareNumericalAndAutodiffGradients( ExpModel, BuildGradModel(ExpModel, registry_), @@ -171,7 +171,7 @@ TEST_P(CppGradients, TestMatMulGrad) { AbstractTensorHandle* A_raw; status_ = TestTensorHandleWithDims( immediate_execution_ctx_.get(), A_vals, A_dims, 2, &A_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); A.reset(A_raw); } @@ -182,12 +182,12 @@ TEST_P(CppGradients, TestMatMulGrad) { AbstractTensorHandle* B_raw; status_ = TestTensorHandleWithDims( immediate_execution_ctx_.get(), B_vals, B_dims, 2, &B_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); B.reset(B_raw); } status_ = registry_.Register("MatMul", MatMulRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); for (bool transpose_a : {false, true}) { for (bool transpose_b : {false, true}) { @@ -214,7 +214,7 @@ TEST_P(CppGradients, TestMatMulGradManual) { AbstractTensorHandle* A_raw; status_ = TestTensorHandleWithDims( immediate_execution_ctx_.get(), A_vals, A_dims, 2, &A_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); A.reset(A_raw); } @@ -225,12 +225,12 @@ TEST_P(CppGradients, TestMatMulGradManual) { AbstractTensorHandle* B_raw; status_ = TestTensorHandleWithDims( immediate_execution_ctx_.get(), B_vals, B_dims, 2, 
&B_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); B.reset(B_raw); } status_ = registry_.Register("MatMul", MatMulRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); bool transpose_a_vals[] = {false, false, true, true}; bool transpose_b_vals[] = {false, true, false, true}; @@ -259,7 +259,7 @@ TEST_P(CppGradients, TestMatMulGradManual) { status_ = RunModel(MatMulGradModel, immediate_execution_ctx_.get(), {A.get(), B.get()}, absl::MakeSpan(outputs), UseFunction()); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CheckTensorValue(outputs[0], dA_vals[i], /*dims*/ {3, 3}, /*abs_error*/ 0)); @@ -277,12 +277,12 @@ TEST_P(CppGradients, TestSqrtGrad) { AbstractTensorHandle* x_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x.reset(x_raw); } status_ = registry_.Register("Sqrt", SqrtRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CompareNumericalAndAutodiffGradients( SqrtModel, BuildGradModel(SqrtModel, registry_), @@ -295,12 +295,12 @@ TEST_P(CppGradients, TestNegGrad) { AbstractTensorHandle* x_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x.reset(x_raw); } status_ = registry_.Register("Neg", NegRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CompareNumericalAndAutodiffGradients( NegModel, BuildGradModel(NegModel, registry_), @@ -313,7 +313,7 @@ TEST_P(CppGradients, TestSubGrad) { AbstractTensorHandle* x_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x.reset(x_raw); } @@ -322,12 +322,12 @@ TEST_P(CppGradients, TestSubGrad) { AbstractTensorHandle* y_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &y_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); y.reset(y_raw); } status_ = registry_.Register("Sub", SubRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CompareNumericalAndAutodiffGradients( SubModel, BuildGradModel(SubModel, registry_), @@ -340,7 +340,7 @@ TEST_P(CppGradients, TestMulGrad) { AbstractTensorHandle* x_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x.reset(x_raw); } @@ -349,12 +349,12 @@ TEST_P(CppGradients, TestMulGrad) { AbstractTensorHandle* y_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, 
&y_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); y.reset(y_raw); } status_ = registry_.Register("Mul", MulRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CompareNumericalAndAutodiffGradients( MulModel, BuildGradModel(MulModel, registry_), @@ -367,12 +367,12 @@ TEST_P(CppGradients, TestLog1pGrad) { AbstractTensorHandle* x_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x.reset(x_raw); } status_ = registry_.Register("Log1p", Log1pRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CompareNumericalAndAutodiffGradients( Log1pModel, BuildGradModel(Log1pModel, registry_), @@ -381,7 +381,7 @@ TEST_P(CppGradients, TestLog1pGrad) { TEST_P(CppGradients, TestDivNoNanGrad) { status_ = registry_.Register("DivNoNan", DivNoNanRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); auto DivNoNanGradModel = BuildGradModel(DivNoNanModel, registry_); @@ -390,7 +390,7 @@ TEST_P(CppGradients, TestDivNoNanGrad) { AbstractTensorHandle* x_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &x_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); x.reset(x_raw); } @@ -399,7 +399,7 @@ TEST_P(CppGradients, TestDivNoNanGrad) { AbstractTensorHandle* y_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 2.0f, &y_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); y.reset(y_raw); } @@ -413,14 +413,14 @@ TEST_P(CppGradients, TestDivNoNanGrad) { AbstractTensorHandle* z_raw = nullptr; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 0.0f, &z_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); z.reset(z_raw); } std::vector outputs(2); status_ = RunModel(DivNoNanGradModel, immediate_execution_ctx_.get(), {x.get(), z.get()}, absl::MakeSpan(outputs), UseFunction()); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CheckTensorValue(outputs[0], {0.0f}, /*dims*/ {}, /*abs_error*/ 0)); ASSERT_NO_FATAL_FAILURE(CheckTensorValue(outputs[1], {0.0f}, /*dims*/ {}, diff --git a/tensorflow/c/experimental/gradients/nn_grad_test.cc b/tensorflow/c/experimental/gradients/nn_grad_test.cc index 15552eed3ca..d6d0d4dd524 100644 --- a/tensorflow/c/experimental/gradients/nn_grad_test.cc +++ b/tensorflow/c/experimental/gradients/nn_grad_test.cc @@ -67,13 +67,13 @@ class CppGradients TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); status_ = StatusFromTF_Status(status.get()); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); { AbstractContext* ctx_raw = nullptr; status_ = 
BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); immediate_execution_ctx_.reset(ctx_raw); } @@ -94,7 +94,7 @@ class CppGradients TEST_P(CppGradients, TestReluGrad) { status_ = registry_.Register("Relu", ReluRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); auto ReluGradModel = BuildGradModel(ReluModel, registry_); @@ -105,7 +105,7 @@ TEST_P(CppGradients, TestReluGrad) { AbstractTensorHandle* X_raw; status_ = TestTensorHandleWithDims( immediate_execution_ctx_.get(), X_vals, X_dims, 2, &X_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); X.reset(X_raw); } @@ -120,14 +120,14 @@ TEST_P(CppGradients, TestReluGrad) { AbstractTensorHandle* Y_raw; status_ = TestScalarTensorHandle( immediate_execution_ctx_.get(), 0.0f, &Y_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); Y.reset(Y_raw); } std::vector outputs(1); status_ = RunModel(ReluGradModel, immediate_execution_ctx_.get(), {Y.get()}, absl::MakeSpan(outputs), UseFunction()); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CheckTensorValue(outputs[0], {0.0f}, /*dims*/ {}, /*abs_error*/ 0)); outputs[0]->Unref(); @@ -148,7 +148,7 @@ TEST_P(CppGradients, TestSparseSoftmaxCrossEntropyWithLogitsGrad) { AbstractTensorHandle* X_raw; status_ = TestTensorHandleWithDims( immediate_execution_ctx_.get(), X_vals, X_dims, 2, &X_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); X.reset(X_raw); } // Label @@ -159,13 +159,13 @@ TEST_P(CppGradients, TestSparseSoftmaxCrossEntropyWithLogitsGrad) { AbstractTensorHandle* Y_raw; status_ = TestTensorHandleWithDims( immediate_execution_ctx_.get(), Y_vals, Y_dims, 1, &Y_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); Y.reset(Y_raw); } status_ = registry_.Register("SparseSoftmaxCrossEntropyWithLogits", SparseSoftmaxCrossEntropyWithLogitsRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); ASSERT_NO_FATAL_FAILURE(CompareNumericalAndAutodiffGradients( SparseSoftmaxCrossEntropyWithLogitsModel, @@ -186,7 +186,7 @@ TEST_P(CppGradients, TestBiasAddGrad) { AbstractTensorHandle* A_raw; status_ = TestTensorHandleWithDims( immediate_execution_ctx_.get(), A_vals, A_dims, 2, &A_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); A.reset(A_raw); } // Bias @@ -197,12 +197,12 @@ TEST_P(CppGradients, TestBiasAddGrad) { AbstractTensorHandle* Bias_raw; status_ = TestTensorHandleWithDims( immediate_execution_ctx_.get(), Bias_vals, Bias_dims, 1, &Bias_raw); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << status_.message(); Bias.reset(Bias_raw); } status_ = registry_.Register("BiasAdd", BiasAddRegisterer); - ASSERT_EQ(errors::OK, status_.code()) << status_.error_message(); + ASSERT_EQ(errors::OK, status_.code()) << 
status_.message(); ASSERT_NO_FATAL_FAILURE(CompareNumericalAndAutodiffGradients( BiasAddModel, BuildGradModel(BiasAddModel, registry_), diff --git a/tensorflow/c/experimental/grappler/grappler_test.cc b/tensorflow/c/experimental/grappler/grappler_test.cc index c7f2739601f..ed4b4d92362 100644 --- a/tensorflow/c/experimental/grappler/grappler_test.cc +++ b/tensorflow/c/experimental/grappler/grappler_test.cc @@ -94,7 +94,7 @@ TEST(Grappler, DeviceTypeNotSet) { tensorflow::Status status = InitGraphPlugin(plugin_init); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); ASSERT_EQ( - status.error_message(), + status.message(), "'device_type' field in TP_OptimizerRegistrationParams must be set."); } @@ -109,7 +109,7 @@ TEST(Grappler, OptimizeFuncNotSet) { tensorflow::Status status = InitGraphPlugin(plugin_init); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); - ASSERT_EQ(status.error_message(), + ASSERT_EQ(status.message(), "'optimize_func' field in TP_Optimizer must be set."); } diff --git a/tensorflow/c/experimental/next_pluggable_device/BUILD b/tensorflow/c/experimental/next_pluggable_device/BUILD index 89c718ec5d8..eda00deb59c 100644 --- a/tensorflow/c/experimental/next_pluggable_device/BUILD +++ b/tensorflow/c/experimental/next_pluggable_device/BUILD @@ -12,6 +12,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/c:c_api_headers", + "//tensorflow/c:c_api_macros_hdrs", "//tensorflow/c:kernels_experimental_hdrs", "//tensorflow/c:kernels_hdrs", "//tensorflow/c:tf_buffer_internal", @@ -41,6 +42,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/c:c_api_headers", + "//tensorflow/c:c_api_macros_hdrs", "//tensorflow/c:kernels_hdrs", "//tensorflow/c:tf_buffer_internal", "//tensorflow/compiler/xla/pjrt/c:pjrt_c_api_hdrs", diff --git a/tensorflow/c/experimental/next_pluggable_device/c_api.cc b/tensorflow/c/experimental/next_pluggable_device/c_api.cc index 3d7150433b9..caa49be2d3f 100644 --- a/tensorflow/c/experimental/next_pluggable_device/c_api.cc +++ b/tensorflow/c/experimental/next_pluggable_device/c_api.cc @@ -56,7 +56,7 @@ void TF_CreatePluginResource(TF_OpKernelContext* ctx, auto cc_status = cc_ctx->resource_manager()->Create( container_name, plugin_resource_name, cc_resource_ptr); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); } void TF_LookupOrCreatePluginResource( @@ -86,7 +86,7 @@ void TF_LookupOrCreatePluginResource( } else { *result_plugin_resource = nullptr; } - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); } // ------------------------- VariableInfo ------------------------------------ @@ -113,7 +113,7 @@ TF_VariableInfo* TF_CreateVariableInfoFromContext(TF_OpKernelContext* ctx, cc_status = tsl::errors::InvalidArgument( "Trying to obtain resource handle from Input[", index, "], which is not type DT_RESOURCE."); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); return nullptr; } const tensorflow::ResourceHandle& handle = @@ -141,20 +141,20 @@ void TF_AllocateTempForVariableInfo(TF_OpKernelContext* ctx, tsl::Status cc_status; if (var_info == nullptr) { cc_status = tsl::errors::InvalidArgument("TF_VariableInfo is NULL."); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); return; } if (var_info->var_info.var() == nullptr) { cc_status = tsl::errors::InvalidArgument( "VariableInfo does not track a 
resource variable."); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); return; } cc_status = cc_ctx->allocate_temp(var_info->var_info.var()->tensor()->dtype(), var_info->var_info.var()->tensor()->shape(), var_info->var_info.var()->tensor()); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); } TF_Tensor* TF_GetTensorFromVariableInfo(TF_VariableInfo* var_info, @@ -162,20 +162,20 @@ TF_Tensor* TF_GetTensorFromVariableInfo(TF_VariableInfo* var_info, tsl::Status cc_status; if (var_info == nullptr) { cc_status = tsl::errors::InvalidArgument("TF_VariableInfo is NULL."); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); return nullptr; } if (var_info->var_info.var() == nullptr) { cc_status = tsl::errors::InvalidArgument( "VariableInfo does not track a resource variable."); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); return nullptr; } tensorflow::Tensor* tensor = var_info->var_info.var()->tensor(); TF_Tensor* result_tensor = tensorflow::TF_TensorFromTensor(*tensor, &cc_status); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); return result_tensor; } @@ -323,6 +323,13 @@ void TF_CreatePjRtBuffer(TF_Tensor* c_tensor, PJRT_Buffer* c_buffer, } tensorflow::AsyncValueTensor* av_tensor = tensorflow::AsyncValueTensor::FromTensor(&tensor); + if (av_tensor == nullptr) { + tensorflow::Set_TF_Status_from_Status( + status, + tsl::errors::Internal( + "The tensor to set PjRtBuffer is not an AsyncValueTensor.")); + return; + } av_tensor->SetBuffer( std::make_unique(pjrt_c_api_client, c_buffer)); TF_SetStatus(status, TF_OK, ""); diff --git a/tensorflow/c/experimental/next_pluggable_device/c_api.h b/tensorflow/c/experimental/next_pluggable_device/c_api.h index 4c476a68322..f8f3db2c737 100644 --- a/tensorflow/c/experimental/next_pluggable_device/c_api.h +++ b/tensorflow/c/experimental/next_pluggable_device/c_api.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_C_API_H_ #include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/kernels.h" #include "tensorflow/c/kernels_experimental.h" #include "tensorflow/c/tf_buffer.h" @@ -26,25 +27,6 @@ limitations under the License. // C API for device. The API is under active development and eventually // should allow registering a plugin device with TensorFlow. -// Macro to control visibility of exported symbols in the shared library (.so, -// .dylib, .dll). -// This duplicates the TF_EXPORT macro definition in -// tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes. 
-#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - #ifdef __cplusplus extern "C" { #endif diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD index d72cf86a7bc..f35d4c1ee04 100644 --- a/tensorflow/c/experimental/saved_model/core/BUILD +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -16,7 +16,6 @@ package( # copybara:uncomment() "//learning/brain/tfrt/aot:__pkg__", "//tensorflow/c:__subpackages__", "//tensorflow/c/experimental/saved_model/internal:__pkg__", - "//tensorflow/cc/experimental/libtf:__pkg__", "//tensorflow/core:__subpackages__", ], licenses = ["notice"], diff --git a/tensorflow/c/experimental/saved_model/core/ops/restore_ops_test.cc b/tensorflow/c/experimental/saved_model/core/ops/restore_ops_test.cc index 52a652a90ef..9f63038ac4c 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/restore_ops_test.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/restore_ops_test.cc @@ -96,7 +96,7 @@ TEST_F(RestoreOpsTest, BadCheckpointPrefixShouldFail) { Status status = internal::SingleRestore( context(), CheckpointPrefix("unknown_bad_checkpoint_prefix"), "x/.ATTRIBUTES/VARIABLE_VALUE", DT_FLOAT, &x_handle); - EXPECT_FALSE(status.ok()) << status.error_message(); + EXPECT_FALSE(status.ok()) << status.message(); } TEST_F(RestoreOpsTest, BadCheckpointKeyShouldFail) { @@ -104,7 +104,7 @@ TEST_F(RestoreOpsTest, BadCheckpointKeyShouldFail) { Status status = internal::SingleRestore( context(), CheckpointPrefix("VarsAndArithmeticObjectGraph"), "bad_checkpoint_key", DT_FLOAT, &x_handle); - EXPECT_FALSE(status.ok()) << status.error_message(); + EXPECT_FALSE(status.ok()) << status.message(); } } // namespace diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc index 59f7306fedc..d6e568090f7 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc @@ -41,7 +41,7 @@ FlatTensorFunction::~FlatTensorFunction() { Status status = ctx_->RemoveFunction(name_); if (!status.ok()) { LOG(ERROR) << "Failed to remove functiondef " << name_ << ". 
" - << status.error_message(); + << status.message(); } } diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.cc b/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.cc index 43b8c3ee303..3bcaee4852a 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.cc @@ -71,7 +71,7 @@ RestoredResource::~RestoredResource() { if (!status.ok()) { LOG(WARNING) << "Failed executing destroy_resource function for RestoredResource: " - << status.error_message(); + << status.message(); } } } diff --git a/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc b/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc index 6947b6eb28d..7d6b50fa6b5 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc +++ b/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc @@ -126,7 +126,7 @@ TEST_P(SavedVariableLoadingTest, AssignAndReadVariableSuccesful) { ImmediateTensorHandlePtr expected_handle = testing::CreateTensorHandle(context(), dtype, shape_vector, 42); AbstractTensorPtr expected_tensor(expected_handle->Resolve(&status)); - TF_EXPECT_OK(status) << status.error_message(); + TF_EXPECT_OK(status) << status.message(); // Assign the tensorhandle to the variable. TF_EXPECT_OK(var->Assign(expected_handle.get())); @@ -135,7 +135,7 @@ TEST_P(SavedVariableLoadingTest, AssignAndReadVariableSuccesful) { ImmediateTensorHandlePtr output_handle; TF_EXPECT_OK(var->ReadValue(&output_handle)); AbstractTensorPtr output_tensor(output_handle->Resolve(&status)); - TF_EXPECT_OK(status) << status.error_message(); + TF_EXPECT_OK(status) << status.message(); // Check that output_tensor == expected_tensor EXPECT_EQ(output_tensor->Type(), expected_tensor->Type()); diff --git a/tensorflow/c/experimental/saved_model/core/test_utils.cc b/tensorflow/c/experimental/saved_model/core/test_utils.cc index 88d44011f09..0f1f9ca4488 100644 --- a/tensorflow/c/experimental/saved_model/core/test_utils.cc +++ b/tensorflow/c/experimental/saved_model/core/test_utils.cc @@ -139,7 +139,7 @@ void CheckBufferDataIsEqual(DataType dtype, int64_t num_elements, void* a, AbstractTensorPtr TensorHandleToTensor(ImmediateExecutionTensorHandle* handle) { Status status; AbstractTensorPtr tensor(handle->Resolve(&status)); - CHECK(status.ok()) << status.error_message(); + CHECK(status.ok()) << status.message(); CHECK_NE(tensor.get(), nullptr); return tensor; } diff --git a/tensorflow/c/experimental/saved_model/core/tf_concrete_function_loading_test.cc b/tensorflow/c/experimental/saved_model/core/tf_concrete_function_loading_test.cc index ae3460f7a61..92a21ae6e04 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_concrete_function_loading_test.cc +++ b/tensorflow/c/experimental/saved_model/core/tf_concrete_function_loading_test.cc @@ -81,8 +81,7 @@ TEST_F(SavedConcreteFunctionLoadingTest, TooFewInputsInSavedConcreteFunction) { std::unique_ptr result; Status status = internal::LoadTFConcreteFunction(saved, &func, {}, context(), &result); - EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) - << status.error_message(); + EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) << status.message(); } // A SavedConcreteFunction whose canonicalized input signature length + @@ -105,8 +104,7 @@ TEST_F(SavedConcreteFunctionLoadingTest, std::unique_ptr result; Status status = internal::LoadTFConcreteFunction(saved, 
&func, captures, context(), &result); - EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) - << status.error_message(); + EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) << status.message(); } // A SavedConcreteFunction whose canonicalized input signature @@ -124,8 +122,7 @@ TEST_F(SavedConcreteFunctionLoadingTest, TooManyInputsInSavedConcreteFunction) { std::unique_ptr result; Status status = internal::LoadTFConcreteFunction(saved, &func, {}, context(), &result); - EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) - << status.error_message(); + EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) << status.message(); } // A SavedConcreteFunction whose canonicalized input signature @@ -149,8 +146,7 @@ TEST_F(SavedConcreteFunctionLoadingTest, std::unique_ptr result; Status status = internal::LoadTFConcreteFunction(saved, &func, captures, context(), &result); - EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) - << status.error_message(); + EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) << status.message(); } // A SavedConcreteFunction whose capture refers to an index not in the capture @@ -174,8 +170,7 @@ TEST_F(SavedConcreteFunctionLoadingTest, ImproperCaptureIndex) { std::unique_ptr result; Status status = internal::LoadTFConcreteFunction(saved, &func, captures, context(), &result); - EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) - << status.error_message(); + EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) << status.message(); } // A SavedConcreteFunction whose outputs are fewer than its corresponding @@ -193,8 +188,7 @@ TEST_F(SavedConcreteFunctionLoadingTest, TooFewOutputsInSavedConcreteFunction) { std::unique_ptr result; Status status = internal::LoadTFConcreteFunction(saved, &func, {}, context(), &result); - EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) - << status.error_message(); + EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) << status.message(); } // A SavedConcreteFunction whose outputs exceed its corresponding functiondef @@ -213,8 +207,7 @@ TEST_F(SavedConcreteFunctionLoadingTest, std::unique_ptr result; Status status = internal::LoadTFConcreteFunction(saved, &func, {}, context(), &result); - EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) - << status.error_message(); + EXPECT_EQ(status.code(), error::FAILED_PRECONDITION) << status.message(); } // A SavedConcreteFunction whose (inputs + captures) = functiondef inputs, @@ -238,7 +231,7 @@ TEST_F(SavedConcreteFunctionLoadingTest, SuccessfulLoad) { std::unique_ptr result; Status status = internal::LoadTFConcreteFunction(saved, &func, captures, context(), &result); - TF_EXPECT_OK(status) << status.error_message(); + TF_EXPECT_OK(status) << status.message(); } // A TFConcreteFunction should register functiondefs on creation, and @@ -257,7 +250,7 @@ TEST_F(SavedConcreteFunctionLoadingTest, RegistersAndRemovesFunctionDefs) { std::unique_ptr result; Status status = internal::LoadTFConcreteFunction(saved, &func, {}, context(), &result); - TF_EXPECT_OK(status) << status.error_message(); + TF_EXPECT_OK(status) << status.message(); // The function should be registered with context. 
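
Editor's note: the error_message() to message() replacements in this file and throughout the rest of the diff track a Status API change in which the accessor evidently returns a string view rather than an owning std::string (cc_ops_test.cc later in this diff copies the result into a std::string for exactly that reason). A minimal sketch of the migration pattern, under that assumption:

#include <string>

#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/status.h"

// Hypothetical helper illustrating the two common call-site shapes.
void LogIfError(const tensorflow::Status& s) {
  if (!s.ok()) {
    // Streaming the view works unchanged in LOG/EXPECT macros.
    LOG(ERROR) << s.message();
    // Copy explicitly when an owning string is needed beyond the Status lifetime.
    std::string owned(s.message());
  }
}
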
EXPECT_TRUE(context()->FindFunctionByName(func_name)); } diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD index 2a7c0a8c419..85647f78b7b 100644 --- a/tensorflow/c/experimental/stream_executor/BUILD +++ b/tensorflow/c/experimental/stream_executor/BUILD @@ -26,7 +26,7 @@ cc_library( hdrs = ["stream_executor.h"], visibility = ["//tensorflow:internal"], deps = [ - "//tensorflow/c:c_api_macros", + "//tensorflow/c:c_api_macros_hdrs", "//tensorflow/c:tf_status_headers", ], ) diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc index 2ba7d3cc953..3c984bcc15c 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc @@ -204,7 +204,7 @@ struct HostCallbackContext { void HostCallbackTrampoline(void* ctx, TF_Status* status) { HostCallbackContext* host_ctx = static_cast(ctx); tsl::Status s = std::move(host_ctx->callback)(); - Set_TF_Status_from_Status(status, s); + tsl::Set_TF_Status_from_Status(status, s); delete host_ctx; } @@ -237,7 +237,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface { stream_executor_->allocate(&device_, size, memory_space, &mem); tsl::Status status = ValidateSPDeviceMemoryBase(mem); if (!status.ok()) { - LOG(ERROR) << status.error_message(); + LOG(ERROR) << status.message(); } return DeviceMemoryBaseFromC(mem); } @@ -284,7 +284,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface { } tsl::Status status = ValidateSPAllocatorStats(c_stats); if (!status.ok()) { - LOG(ERROR) << status.error_message(); + LOG(ERROR) << status.message(); return absl::nullopt; } ::stream_executor::AllocatorStats stats; diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc index cf21374c48f..90b4dad5daa 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc @@ -65,7 +65,7 @@ TEST(StreamExecutor, NameNotSet) { tsl::Status status = InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); - ASSERT_EQ(status.error_message(), "'name' field in SP_Platform must be set."); + ASSERT_EQ(status.message(), "'name' field in SP_Platform must be set."); } TEST(StreamExecutor, InvalidNameWithSemicolon) { @@ -81,7 +81,7 @@ TEST(StreamExecutor, InvalidNameWithSemicolon) { InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); EXPECT_THAT( - status.error_message(), + status.message(), testing::ContainsRegex("Device name/type 'INVALID:NAME' must match")); } @@ -97,7 +97,7 @@ TEST(StreamExecutor, InvalidNameWithSlash) { tsl::Status status = InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); - EXPECT_THAT(status.error_message(), + EXPECT_THAT(status.message(), testing::ContainsRegex("Device name/type 'INVALID/' must match")); } @@ -113,7 +113,7 @@ TEST(StreamExecutor, CreateDeviceNotSet) { tsl::Status status = InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); - ASSERT_EQ(status.error_message(), + ASSERT_EQ(status.message(), "'create_device' field in 
SP_PlatformFns must be set."); } @@ -130,7 +130,7 @@ TEST(StreamExecutor, UnifiedMemoryAllocateNotSet) { InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); ASSERT_EQ( - status.error_message(), + status.message(), "'unified_memory_allocate' field in SP_StreamExecutor must be set."); } @@ -327,7 +327,7 @@ TEST_F(StreamExecutorTest, StreamStatus) { status_ok = false; auto updated_status = stream.RefreshStatus(); ASSERT_FALSE(stream.ok()); - ASSERT_EQ(updated_status.error_message(), "Test error"); + ASSERT_EQ(updated_status.message(), "Test error"); } TEST_F(StreamExecutorTest, CreateEvent) { diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test_util.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test_util.cc index 6722b86c0ef..41928bc469c 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor_test_util.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor_test_util.cc @@ -83,9 +83,6 @@ void SynchronizeAllActivity(const SP_Device* const device, TF_Bool HostCallback(const SP_Device* const device, SP_Stream stream, SE_StatusCallbackFn const callback_fn, void* const callback_arg) { - TSL_Status* status_ignored = TSL_NewStatus(); - callback_fn(callback_arg, status_ignored); - TSL_DeleteStatus(status_ignored); return true; } diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc index 9f34547f9ee..59f978000e6 100644 --- a/tensorflow/c/kernels.cc +++ b/tensorflow/c/kernels.cc @@ -36,6 +36,9 @@ limitations under the License. #if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) #include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h" #include "tensorflow/compiler/xla/stream_executor/stream.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/tsl/framework/device_id_utils.h" +#include "tensorflow/tsl/platform/statusor.h" #endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) using tensorflow::errors::InvalidArgument; @@ -660,12 +663,12 @@ TF_Buffer* TF_OpKernelConstruction_GetAttrFunction(TF_OpKernelConstruction* ctx, tensorflow::NameAttrList function; auto cc_status = cc_ctx->GetAttr(attr_name, &function); if (!cc_status.ok()) { - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); return nullptr; } TF_Buffer* buffer = TF_NewBuffer(); cc_status = tensorflow::MessageToBuffer(function, buffer); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); if (!cc_status.ok()) return nullptr; else @@ -753,10 +756,19 @@ int64_t TF_GetStepId(TF_OpKernelContext* ctx) { int TF_GetDeviceId(TF_OpKernelContext* ctx) { // TensorFlow always sets device in OpKernelContext. 
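
Editor's note: the hunk continuing below rewrites TF_GetDeviceId so that, outside mobile/slim builds, it resolves the id through tsl::GetDeviceIdFromDeviceParsedName and returns -1 when no id can be derived. A hedged sketch of how a plugin kernel might consume it; MyKernelCompute is a made-up name, only TF_GetDeviceId itself comes from this diff:

#include "tensorflow/c/kernels.h"

// Hypothetical plugin kernel compute function.
void MyKernelCompute(void* kernel, TF_OpKernelContext* ctx) {
  // Per the new implementation, -1 means no usable device id was found.
  const int device_id = TF_GetDeviceId(ctx);
  if (device_id < 0) {
    return;  // A real kernel would fall back to a default device here.
  }
  // Index into per-device plugin state with device_id here.
}
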
- auto* device = - reinterpret_cast<::tensorflow::OpKernelContext*>(ctx)->device(); - if (!device->parsed_name().has_id) return -1; - return device->parsed_name().id; + const tensorflow::DeviceBase* device_base = + reinterpret_cast(ctx)->device(); +#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD) + if (!device_base->parsed_name().has_id) return -1; + return device_base->parsed_name().id; +#else + const auto* device = reinterpret_cast( + device_base->UnderlyingDevice()); + const tsl::StatusOr id = tsl::GetDeviceIdFromDeviceParsedName( + device->parsed_name(), tensorflow::DeviceType(device->device_type())); + if (!id.ok()) return -1; + return *id; +#endif // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD) } TF_StringView TF_GetOpKernelName(TF_OpKernelContext* ctx) { @@ -791,8 +803,6 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index, int num_dims, size_t len, TF_Status* status) { TF_SetStatus(status, TF_OK, ""); auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context); - static_assert(sizeof(int64_t) == sizeof(int64_t), - "64-bit int types should match in size"); tensorflow::gtl::ArraySlice dimarray( reinterpret_cast(dims), num_dims); tensorflow::Tensor* tensor; @@ -818,8 +828,6 @@ TF_Tensor* TF_ForwardInputOrAllocateOutput( TF_SetStatus(status, TF_OK, ""); auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context); - static_assert(sizeof(int64_t) == sizeof(int64_t), - "64-bit int types should match in size"); tensorflow::gtl::ArraySlice input_indices_array( candidate_input_indices, num_candidate_input_indices); tensorflow::gtl::ArraySlice output_dimarray( @@ -847,8 +855,6 @@ TF_Tensor* TF_AllocateTemp(TF_OpKernelContext* context, TF_DataType dtype, TF_Status* status) { auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context); TF_SetStatus(status, TF_OK, ""); - static_assert(sizeof(int64_t) == sizeof(int64_t), - "64-bit int types should match in size"); tensorflow::gtl::ArraySlice dimarray( reinterpret_cast(dims), num_dims); if (attributes && !attributes->struct_size) { diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h index 7759c02daa2..665aff8f17a 100644 --- a/tensorflow/c/kernels.h +++ b/tensorflow/c/kernels.h @@ -19,30 +19,12 @@ limitations under the License. #include #include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/experimental/stream_executor/stream_executor.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_tensor.h" -// Macro to control visibility of exported symbols in the shared library (.so, -// .dylib, .dll). -// This duplicates the TF_EXPORT macro definition in -// tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes. -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - #ifdef __cplusplus extern "C" { #endif @@ -283,7 +265,11 @@ TF_CAPI_EXPORT extern int64_t TF_GetIterId(TF_OpKernelContext* ctx); // Returns the Step ID of the given context. TF_CAPI_EXPORT extern int64_t TF_GetStepId(TF_OpKernelContext* ctx); -// Returns the Device ID of the device that the context possesses. +// Returns the Device ID of the device that the context possesses. 
Returns the +// PlatformDeviceId if a mapping between between TfDeviceId and PlatformDeviceId +// is set; otherwise returns the id in the device name. Please refer to +// tensorflow/tsl/framework/device_id.h for more details. +// For mobile or slim build, returns the id in the device name. TF_CAPI_EXPORT extern int TF_GetDeviceId(TF_OpKernelContext* ctx); // Returns the graph def version of the given context. diff --git a/tensorflow/c/kernels_experimental.cc b/tensorflow/c/kernels_experimental.cc index 7590921d952..259d1cac9df 100644 --- a/tensorflow/c/kernels_experimental.cc +++ b/tensorflow/c/kernels_experimental.cc @@ -262,7 +262,7 @@ void TF_AssignUpdateVariable(TF_OpKernelContext* ctx, int input_index, Status status = LookupResource(context, HandleFromInput(context, input_index), &variable); if (!status.ok()) { - printf("Failed with error: %s\n", status.error_message().c_str()); + printf("Failed with error: %s\n", tsl::NullTerminatedMessage(status)); abort(); } const Tensor& value = context->input(value_index); @@ -475,6 +475,118 @@ static Status ValidateVariantType(const Variant& variant) { return ::tensorflow::OkStatus(); } +static Status VariantBinaryAddFunc( + ::tensorflow::OpKernelContext* cc_ctx, const Variant& a, const Variant& b, + Variant* out, + void (*binary_add_func)(TF_OpKernelContext* ctx, TF_Tensor* a, TF_Tensor* b, + TF_Tensor* out)); + +static Status CCBinaryAddFunc( + ::tensorflow::OpKernelContext* cc_ctx, const Tensor& cc_a, + const Tensor& cc_b, Tensor* cc_out, + void (*binary_add_func)(TF_OpKernelContext* ctx, TF_Tensor* a, TF_Tensor* b, + TF_Tensor* out)) { + if (cc_a.dtype() == ::tensorflow::DT_INVALID) { + *cc_out = cc_b; + return ::tensorflow::OkStatus(); + } + if (cc_b.dtype() == ::tensorflow::DT_INVALID) { + *cc_out = cc_a; + return ::tensorflow::OkStatus(); + } + + Status status; + TF_Tensor* a = TF_TensorFromTensor(cc_a, &status); + TF_RETURN_IF_ERROR(status); + + TF_Tensor* b = TF_TensorFromTensor(cc_b, &status); + if (!status.ok()) { + TF_DeleteTensor(a); + return status; + } + + ::tensorflow::AllocatorAttributes attr; + if (cc_a.dtype() == ::tensorflow::DT_VARIANT) { + attr.set_on_host(true); + } + + status = cc_ctx->allocate_temp(cc_a.dtype(), cc_a.shape(), cc_out, attr); + if (!status.ok()) { + TF_DeleteTensor(a); + TF_DeleteTensor(b); + return status; + } + + TF_Tensor* out = TF_TensorFromTensor(*cc_out, &status); + if (!status.ok()) { + TF_DeleteTensor(a); + TF_DeleteTensor(b); + return status; + } + + auto* ctx = reinterpret_cast(cc_ctx); + if (cc_a.dtype() == ::tensorflow::DT_VARIANT) { + return VariantBinaryAddFunc( + cc_ctx, cc_a.scalar()(), cc_b.scalar()(), + cc_out->scalar().data(), binary_add_func); + } else { + binary_add_func(ctx, a, b, out); + return cc_ctx->status(); + } +}; + +static Status VariantBinaryAddFunc( + ::tensorflow::OpKernelContext* cc_ctx, const Variant& a, const Variant& b, + Variant* out, + void (*binary_add_func)(TF_OpKernelContext* ctx, TF_Tensor* a, TF_Tensor* b, + TF_Tensor* out)) { + auto cc_binary_add = [binary_add_func](::tensorflow::OpKernelContext* cc_ctx, + const Tensor& cc_a, const Tensor& cc_b, + Tensor* cc_out) { + return CCBinaryAddFunc(cc_ctx, cc_a, cc_b, cc_out, binary_add_func); + }; + + if (out == nullptr) { + return ::tensorflow::errors::Internal( + "The output variant hasn't been initialized"); + } + + if (a.TypeId() != b.TypeId()) { + return ::tensorflow::errors::Internal( + "BinaryOpVariants: Variants a and b have different " + "type ids. Type names: '", + a.TypeName(), "' vs. 
'", b.TypeName(), "'"); + } + + if (a.TypeId() == tensorflow::TypeIndex::Make<::tensorflow::TensorList>()) { + TF_RETURN_IF_ERROR(ValidateVariantType<::tensorflow::TensorList>(a)); + *out = ::tensorflow::TensorList(); + + return ::tensorflow::TensorListBinaryAdd( + cc_ctx, *a.get<::tensorflow::TensorList>(), + *b.get<::tensorflow::TensorList>(), + out->get<::tensorflow::TensorList>(), cc_binary_add); + } else if (a.TypeId() == tensorflow::TypeIndex::Make< + ::tensorflow::data::OptionalVariant>()) { + TF_RETURN_IF_ERROR( + ValidateVariantType<::tensorflow::data::OptionalVariant>(a)); + *out = ::tensorflow::data::OptionalVariant(); + + return ::tensorflow::data::OptionalBinaryAdd( + cc_ctx, *a.get<::tensorflow::data::OptionalVariant>(), + *b.get<::tensorflow::data::OptionalVariant>(), + out->get<::tensorflow::data::OptionalVariant>(), cc_binary_add); + } + + const std::string type_index_name = + ::tensorflow::port::MaybeAbiDemangle(a.TypeId().name()); + + return ::tensorflow::errors::Internal( + "No unary variant binary_op function found for op ADD Variant " + "type_name: ", + type_index_name, " for device type: ", cc_ctx->device()->name()); +} + void TF_AddNVariant(TF_OpKernelContext* ctx, void (*binary_add_func)(TF_OpKernelContext* ctx, TF_Tensor* a, TF_Tensor* b, @@ -482,97 +594,11 @@ void TF_AddNVariant(TF_OpKernelContext* ctx, TF_Status* status) { auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx); - auto cc_binary_add_func = [binary_add_func]( - ::tensorflow::OpKernelContext* cc_ctx, - const Tensor& cc_a, const Tensor& cc_b, - Tensor* cc_out) { - if (cc_a.dtype() == ::tensorflow::DT_INVALID) { - *cc_out = cc_b; - return ::tensorflow::OkStatus(); - } - if (cc_b.dtype() == ::tensorflow::DT_INVALID) { - *cc_out = cc_a; - return ::tensorflow::OkStatus(); - } - - Status status; - TF_Tensor* a = TF_TensorFromTensor(cc_a, &status); - TF_RETURN_IF_ERROR(status); - - TF_Tensor* b = TF_TensorFromTensor(cc_b, &status); - if (!status.ok()) { - TF_DeleteTensor(a); - return status; - } - - ::tensorflow::AllocatorAttributes attr; - if (cc_a.dtype() == ::tensorflow::DT_VARIANT) { - attr.set_on_host(true); - } - - status = cc_ctx->allocate_temp(cc_a.dtype(), cc_a.shape(), cc_out, attr); - if (!status.ok()) { - TF_DeleteTensor(a); - TF_DeleteTensor(b); - return status; - } - - TF_Tensor* out = TF_TensorFromTensor(*cc_out, &status); - if (!status.ok()) { - TF_DeleteTensor(a); - TF_DeleteTensor(b); - return status; - } - - auto* ctx = reinterpret_cast(cc_ctx); - binary_add_func(ctx, a, b, out); - return cc_ctx->status(); - }; - - auto binary_add_variant = [cc_binary_add_func]( - ::tensorflow::OpKernelContext* cc_ctx, - const Variant& a, const Variant& b, - Variant* out) { - if (out == nullptr) { - return ::tensorflow::errors::Internal( - "The output variant hasn't been initialized"); - } - - if (a.TypeId() != b.TypeId()) { - return ::tensorflow::errors::Internal( - "BinaryOpVariants: Variants a and b have different " - "type ids. Type names: '", - a.TypeName(), "' vs. 
'", b.TypeName(), "'"); - } - - if (a.TypeId() == tensorflow::TypeIndex::Make<::tensorflow::TensorList>()) { - TF_RETURN_IF_ERROR(ValidateVariantType<::tensorflow::TensorList>(a)); - *out = ::tensorflow::TensorList(); - - return ::tensorflow::TensorListBinaryAdd( - cc_ctx, *a.get<::tensorflow::TensorList>(), - *b.get<::tensorflow::TensorList>(), - out->get<::tensorflow::TensorList>(), cc_binary_add_func); - } else if (a.TypeId() == tensorflow::TypeIndex::Make< - ::tensorflow::data::OptionalVariant>()) { - TF_RETURN_IF_ERROR( - ValidateVariantType<::tensorflow::data::OptionalVariant>(a)); - *out = ::tensorflow::data::OptionalVariant(); - - return ::tensorflow::data::OptionalBinaryAdd( - cc_ctx, *a.get<::tensorflow::data::OptionalVariant>(), - *b.get<::tensorflow::data::OptionalVariant>(), - out->get<::tensorflow::data::OptionalVariant>(), cc_binary_add_func); - } - - const std::string type_index_name = - ::tensorflow::port::MaybeAbiDemangle(a.TypeId().name()); - - return ::tensorflow::errors::Internal( - "No unary variant binary_op function found for op ADD Variant " - "type_name: ", - type_index_name, " for device type: ", cc_ctx->device()->name()); - }; + auto binary_add_variant = + [binary_add_func](::tensorflow::OpKernelContext* cc_ctx, const Variant& a, + const Variant& b, Variant* out) { + return VariantBinaryAddFunc(cc_ctx, a, b, out, binary_add_func); + }; ::tensorflow::AddNVariant(cc_ctx, binary_add_variant); ::tensorflow::Set_TF_Status_from_Status(status, cc_ctx->status()); } diff --git a/tensorflow/c/kernels_experimental.h b/tensorflow/c/kernels_experimental.h index fbf0247f1c0..a36ea55e311 100644 --- a/tensorflow/c/kernels_experimental.h +++ b/tensorflow/c/kernels_experimental.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_C_KERNELS_EXPERIMENTAL_H_ #define TENSORFLOW_C_KERNELS_EXPERIMENTAL_H_ +#include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/kernels.h" // -------------------------------------------------------------------------- @@ -24,25 +25,6 @@ limitations under the License. // The API here is subject to changes in the future. // -------------------------------------------------------------------------- -// Macro to control visibility of exported symbols in the shared library (.so, -// .dylib, .dll). -// This duplicates the TF_EXPORT macro definition in -// tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes. -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - #ifdef __cplusplus extern "C" { #endif diff --git a/tensorflow/c/ops.h b/tensorflow/c/ops.h index 7463809e35b..5d3a1e8965d 100644 --- a/tensorflow/c/ops.h +++ b/tensorflow/c/ops.h @@ -73,23 +73,10 @@ limitations under the License. 
#include #include +#include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - #ifdef __cplusplus extern "C" { #endif diff --git a/tensorflow/python/lib/core/safe_ptr.cc b/tensorflow/c/safe_ptr.cc similarity index 95% rename from tensorflow/python/lib/core/safe_ptr.cc rename to tensorflow/c/safe_ptr.cc index ce852a4f009..fa200b0712f 100644 --- a/tensorflow/python/lib/core/safe_ptr.cc +++ b/tensorflow/c/safe_ptr.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/python/lib/core/safe_ptr.h" +#include "tensorflow/c/safe_ptr.h" namespace tensorflow { diff --git a/tensorflow/python/lib/core/safe_ptr.h b/tensorflow/c/safe_ptr.h similarity index 90% rename from tensorflow/python/lib/core/safe_ptr.h rename to tensorflow/c/safe_ptr.h index 00f47d7bbe6..8d8b8141b0b 100644 --- a/tensorflow/python/lib/core/safe_ptr.h +++ b/tensorflow/c/safe_ptr.h @@ -13,16 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_ -#define TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_ - -#include +#ifndef TENSORFLOW_C_SAFE_PTR_H_ +#define TENSORFLOW_C_SAFE_PTR_H_ #include #include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api.h" -#include "tensorflow/python/lib/core/safe_pyobject_ptr.h" namespace tensorflow { namespace detail { @@ -68,4 +65,4 @@ Safe_TF_BufferPtr make_safe(TF_Buffer* buffer); } // namespace tensorflow -#endif // TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_ +#endif // TENSORFLOW_C_SAFE_PTR_H_ diff --git a/tensorflow/c/tf_buffer.h b/tensorflow/c/tf_buffer.h index f18f2116536..71a9aef844c 100644 --- a/tensorflow/c/tf_buffer.h +++ b/tensorflow/c/tf_buffer.h @@ -18,24 +18,7 @@ limitations under the License. #include -// Macro to control visibility of exported symbols in the shared library (.so, -// .dylib, .dll). -// This duplicates the TF_EXPORT macro definition in -// tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes. -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG +#include "tensorflow/c/c_api_macros.h" #ifdef __cplusplus extern "C" { diff --git a/tensorflow/c/tf_buffer_internal.h b/tensorflow/c/tf_buffer_internal.h index a538de7e895..805f632cf72 100644 --- a/tensorflow/c/tf_buffer_internal.h +++ b/tensorflow/c/tf_buffer_internal.h @@ -22,11 +22,7 @@ limitations under the License. 
#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/status.h" -namespace tsl { -class Status; -} namespace tensorflow { -using tsl::Status; Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in, TF_Buffer* out); diff --git a/tensorflow/c/tf_datatype.h b/tensorflow/c/tf_datatype.h index df0c1fb45b0..1f5597fe99a 100644 --- a/tensorflow/c/tf_datatype.h +++ b/tensorflow/c/tf_datatype.h @@ -18,24 +18,7 @@ limitations under the License. #include -// Macro to control visibility of exported symbols in the shared library (.so, -// .dylib, .dll). -// This duplicates the TF_EXPORT macro definition in -// tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes. -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG +#include "tensorflow/c/c_api_macros.h" #ifdef __cplusplus extern "C" { diff --git a/tensorflow/c/tf_status.h b/tensorflow/c/tf_status.h index db1d32bf8e7..22b237e16df 100644 --- a/tensorflow/c/tf_status.h +++ b/tensorflow/c/tf_status.h @@ -16,22 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_C_TF_STATUS_H_ #define TENSORFLOW_C_TF_STATUS_H_ +#include "tensorflow/c/c_api_macros.h" #include "tensorflow/tsl/c/tsl_status.h" -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - #ifdef __cplusplus extern "C" { #endif diff --git a/tensorflow/c/tf_tensor.cc b/tensorflow/c/tf_tensor.cc index 90e19c55c89..d4efcaf50aa 100644 --- a/tensorflow/c/tf_tensor.cc +++ b/tensorflow/c/tf_tensor.cc @@ -183,7 +183,7 @@ void TF_TensorBitcastFrom(const TF_Tensor* from, TF_DataType type, *tensorflow::down_cast( from->tensor), static_cast(type), new_dims, num_new_dims)); - Set_TF_Status_from_Status(status, cc_status); + tsl::Set_TF_Status_from_Status(status, cc_status); } namespace tensorflow { diff --git a/tensorflow/c/tf_tensor.h b/tensorflow/c/tf_tensor.h index e8bef826599..05c74b8f342 100644 --- a/tensorflow/c/tf_tensor.h +++ b/tensorflow/c/tf_tensor.h @@ -23,25 +23,6 @@ limitations under the License. #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" -// Macro to control visibility of exported symbols in the shared library (.so, -// .dylib, .dll). -// This duplicates the TF_EXPORT macro definition in -// tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes. -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - #ifdef __cplusplus extern "C" { #endif diff --git a/tensorflow/c/tf_tstring.h b/tensorflow/c/tf_tstring.h index f9fb2fe083f..876fd5f384f 100644 --- a/tensorflow/c/tf_tstring.h +++ b/tensorflow/c/tf_tstring.h @@ -15,23 +15,10 @@ limitations under the License. 
#ifndef TENSORFLOW_C_TF_TSTRING_H_ #define TENSORFLOW_C_TF_TSTRING_H_ +#include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/platform/ctstring.h" -#ifdef SWIG -#define TF_CAPI_EXPORT -#else -#if defined(_WIN32) -#ifdef TF_COMPILE_LIBRARY -#define TF_CAPI_EXPORT __declspec(dllexport) -#else -#define TF_CAPI_EXPORT __declspec(dllimport) -#endif // TF_COMPILE_LIBRARY -#else -#define TF_CAPI_EXPORT __attribute__((visibility("default"))) -#endif // _WIN32 -#endif // SWIG - #ifdef __cplusplus extern "C" { #endif diff --git a/tensorflow/cc/client/client_session.cc b/tensorflow/cc/client/client_session.cc index 2ea322ffcb2..a6c7be07554 100644 --- a/tensorflow/cc/client/client_session.cc +++ b/tensorflow/cc/client/client_session.cc @@ -108,10 +108,14 @@ Status ClientSession::Run(const RunOptions& run_options, const FeedType& inputs, std::vector* outputs, RunMetadata* run_metadata) const { std::vector> feeds; + feeds.reserve(inputs.size()); for (auto const& feed : inputs) { TF_RETURN_IF_ERROR(feed.second.status); - feeds.emplace_back(feed.first.name(), feed.second.tensor); + feeds.emplace_back(std::piecewise_construct, + std::forward_as_tuple(feed.first.name()), + std::forward_as_tuple(feed.second.tensor)); } + std::vector output_tensor_names; output_tensor_names.reserve(fetch_outputs.size()); for (auto const& output : fetch_outputs) { diff --git a/tensorflow/cc/experimental/libtf/tests/function_test.cc b/tensorflow/cc/experimental/libtf/tests/function_test.cc index 226cbf2afa7..a9b4061f1a0 100644 --- a/tensorflow/cc/experimental/libtf/tests/function_test.cc +++ b/tensorflow/cc/experimental/libtf/tests/function_test.cc @@ -50,7 +50,7 @@ class FunctionTest impl::TaggedValueTensor CreateScalarTensor(T val) { AbstractTensorHandle* raw = nullptr; Status s = TestScalarTensorHandle(ctx_.get(), val, &raw); - CHECK_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + CHECK_EQ(tensorflow::errors::OK, s.code()) << s.message(); return impl::TaggedValueTensor(raw, /*add_ref=*/false); } @@ -64,12 +64,12 @@ class FunctionTest TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); Status s = tensorflow::StatusFromTF_Status(status.get()); - CHECK_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + CHECK_EQ(tensorflow::errors::OK, s.code()) << s.message(); // Set the runtime impl, Core RT vs TFRT. 
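
Editor's note: the client_session.cc hunk earlier in this block reserves the feeds vector and switches to std::piecewise_construct, so each name/tensor pair is constructed in place rather than through a temporary pair. A self-contained sketch of the idiom, with a plain int payload standing in for Tensor:

#include <string>
#include <tuple>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<std::string, int>> feeds;
  feeds.reserve(2);
  // Build each pair member in place: the string from a char*, the int from 42.
  feeds.emplace_back(std::piecewise_construct, std::forward_as_tuple("x:0"),
                     std::forward_as_tuple(42));
  feeds.emplace_back(std::piecewise_construct, std::forward_as_tuple("y:0"),
                     std::forward_as_tuple(7));
  return 0;
}
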
AbstractContext* ctx_raw = nullptr; s = BuildImmediateExecutionContext(UseTfrt(), &ctx_raw); - CHECK_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + CHECK_EQ(tensorflow::errors::OK, s.code()) << s.message(); ctx_.reset(ctx_raw); } }; @@ -139,7 +139,7 @@ template void ExpectEquals(AbstractTensorHandle* t, T expected) { TF_Tensor* result_t; Status s = tensorflow::GetValue(t, &result_t); - ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(s.ok()) << s.message(); auto value = static_cast(TF_TensorData(result_t)); EXPECT_EQ(*value, expected); TF_DeleteTensor(result_t); @@ -156,10 +156,10 @@ TEST_P(FunctionTest, Square) { PartialTensorShape unknown_shape; TaggedValue signature(unknown_shape, DT_FLOAT); Status s = tf_function.RegisterTrace(std::move(trace), signature, signature); - ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(s.ok()) << s.message(); TaggedValue args(std::move(x)); StatusOr v = tf_function.Execute(ctx_.get(), args); - ASSERT_TRUE(v.ok()) << v.status().error_message(); + ASSERT_TRUE(v.ok()) << v.status().message(); const TaggedValue& result = v.value(); AbstractTensorHandle* t = result.tensor().get(); ExpectEquals(t, 4.0f); @@ -178,12 +178,12 @@ TEST_P(FunctionTest, Add) { input_signature.tuple().emplace_back(tensor_spec); Status s = tf_function.RegisterTrace(std::move(trace), input_signature, tensor_spec); - ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(s.ok()) << s.message(); TaggedValue args = TaggedValue::Tuple(); args.tuple().emplace_back(TaggedValue(x)); args.tuple().emplace_back(TaggedValue(x)); StatusOr v = tf_function.Execute(ctx_.get(), args); - ASSERT_TRUE(v.ok()) << v.status().error_message(); + ASSERT_TRUE(v.ok()) << v.status().message(); const TaggedValue& result = v.value(); ExpectEquals(result.tensor().get(), 4.0f); } @@ -200,12 +200,12 @@ TEST_P(FunctionTest, IdentityN) { signature.tuple().emplace_back(tensor_spec); signature.tuple().emplace_back(tensor_spec); Status s = tf_function.RegisterTrace(std::move(trace), signature, signature); - ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(s.ok()) << s.message(); TaggedValue args = TaggedValue::Tuple(); args.tuple().emplace_back(TaggedValue(x)); args.tuple().emplace_back(TaggedValue(y)); StatusOr v = tf_function.Execute(ctx_.get(), args); - ASSERT_TRUE(v.ok()) << v.status().error_message(); + ASSERT_TRUE(v.ok()) << v.status().message(); const TaggedValue& result = v.value(); ExpectEquals(result.tuple()[0].tensor().get(), 2.0f); ExpectEquals(result.tuple()[1].tensor().get(), 4.0f); @@ -220,13 +220,13 @@ TEST_P(FunctionTest, UnaryFuncCalledWithMultipleArgsFails) { PartialTensorShape unknown_shape; TaggedValue signature(unknown_shape, DT_FLOAT); Status s = tf_function.RegisterTrace(std::move(trace), signature, signature); - ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(s.ok()) << s.message(); TaggedValue args = TaggedValue::Tuple(); args.tuple().emplace_back(TaggedValue(x)); args.tuple().emplace_back(TaggedValue(x)); StatusOr v = tf_function.Execute(ctx_.get(), args); ASSERT_TRUE(tensorflow::errors::IsInvalidArgument(v.status())); - ASSERT_TRUE(absl::StrContains(v.status().error_message(), "No match")); + ASSERT_TRUE(absl::StrContains(v.status().message(), "No match")); } TEST_P(FunctionTest, IncorrectArityOfOutputSignatureFails) { @@ -248,13 +248,13 @@ TEST_P(FunctionTest, IncorrectArityOfOutputSignatureFails) { TaggedValue output_signature(unknown_shape, DT_FLOAT); Status s = tf_function.RegisterTrace(std::move(trace), input_signature, output_signature); - 
ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(s.ok()) << s.message(); TaggedValue args = TaggedValue::Tuple(); args.tuple().emplace_back(TaggedValue(x)); args.tuple().emplace_back(TaggedValue(y)); StatusOr v = tf_function.Execute(ctx_.get(), args); ASSERT_TRUE(tensorflow::errors::IsInvalidArgument(v.status())) << v.status(); - ASSERT_TRUE(absl::StrContains(v.status().error_message(), + ASSERT_TRUE(absl::StrContains(v.status().message(), "Expecting 2 outputs, but *num_retvals is 1")); } @@ -273,15 +273,15 @@ TEST_P(FunctionTest, IncorrectDtypeInOutputSignatureFails) { TaggedValue output_tensor_spec(unknown_shape, tensorflow::DT_INT64); Status s = tf_function.RegisterTrace(std::move(trace), input_signature, output_tensor_spec); - ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(s.ok()) << s.message(); TaggedValue args = TaggedValue::Tuple(); args.tuple().emplace_back(TaggedValue(x)); args.tuple().emplace_back(TaggedValue(x)); StatusOr v = tf_function.Execute(ctx_.get(), args); ASSERT_TRUE(tensorflow::errors::IsInternal(v.status())) << v.status(); - ASSERT_TRUE(absl::StrContains(v.status().error_message(), - "Shape and dtype of tensor")); - ASSERT_TRUE(absl::StrContains(v.status().error_message(), + ASSERT_TRUE( + absl::StrContains(v.status().message(), "Shape and dtype of tensor")); + ASSERT_TRUE(absl::StrContains(v.status().message(), "does not match that in signature")); } diff --git a/tensorflow/cc/experimental/libtf/tests/tensor_test.cc b/tensorflow/cc/experimental/libtf/tests/tensor_test.cc index 0115d0ac50f..3f4708f0f0d 100644 --- a/tensorflow/cc/experimental/libtf/tests/tensor_test.cc +++ b/tensorflow/cc/experimental/libtf/tests/tensor_test.cc @@ -43,7 +43,7 @@ class UnifiedCAPI TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); Status s = tensorflow::StatusFromTF_Status(status.get()); - CHECK_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + CHECK_EQ(tensorflow::errors::OK, s.code()) << s.message(); } }; @@ -52,7 +52,7 @@ template TaggedValue MakeContext(T runtime) { AbstractContext* ctx_raw = nullptr; Status s = BuildImmediateExecutionContext(runtime, &ctx_raw); - // ASSERT_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + // ASSERT_EQ(tensorflow::errors::OK, s.code()) << s.message(); return TaggedValue::Capsule(static_cast(ctx_raw), [](void* p) { tensorflow::internal::AbstractContextDeleter()( static_cast(p)); @@ -67,7 +67,7 @@ TEST_P(UnifiedCAPI, HoldTensors) { AbstractContext* ctx_raw = nullptr; Status s = BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); - ASSERT_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(tensorflow::errors::OK, s.code()) << s.message(); ctx.reset(ctx_raw); } @@ -76,7 +76,7 @@ TEST_P(UnifiedCAPI, HoldTensors) { { AbstractTensorHandle* x_raw = nullptr; Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &x_raw); - ASSERT_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(tensorflow::errors::OK, s.code()) << s.message(); x.reset(x_raw, false); } // Manually copy pointer so we can later compare the reference count. 
diff --git a/tensorflow/cc/experimental/libtf/tests/variable_test.cc b/tensorflow/cc/experimental/libtf/tests/variable_test.cc index 402943a58ca..8e7aca22bdc 100644 --- a/tensorflow/cc/experimental/libtf/tests/variable_test.cc +++ b/tensorflow/cc/experimental/libtf/tests/variable_test.cc @@ -48,7 +48,7 @@ class VariableTest impl::TaggedValueTensor CreateScalarTensor(T val) { AbstractTensorHandle* raw = nullptr; Status s = TestScalarTensorHandle(ctx_.get(), val, &raw); - CHECK_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + CHECK_EQ(tensorflow::errors::OK, s.code()) << s.message(); return impl::TaggedValueTensor(raw, /*add_ref=*/false); } @@ -62,12 +62,12 @@ class VariableTest TF_StatusPtr status(TF_NewStatus()); TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); Status s = tensorflow::StatusFromTF_Status(status.get()); - CHECK_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + CHECK_EQ(tensorflow::errors::OK, s.code()) << s.message(); // Set the runtime impl, Core RT vs TFRT. AbstractContext* ctx_raw = nullptr; s = BuildImmediateExecutionContext(UseTfrt(), &ctx_raw); - CHECK_EQ(tensorflow::errors::OK, s.code()) << s.error_message(); + CHECK_EQ(tensorflow::errors::OK, s.code()) << s.message(); ctx_.reset(ctx_raw); } }; @@ -76,7 +76,7 @@ template void ExpectEquals(AbstractTensorHandle* t, T expected) { TF_Tensor* result_t; Status s = tensorflow::GetValue(t, &result_t); - ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(s.ok()) << s.message(); auto value = static_cast(TF_TensorData(result_t)); EXPECT_EQ(*value, expected); TF_DeleteTensor(result_t); @@ -89,7 +89,7 @@ TEST_P(VariableTest, CreateAssignReadDestroy) { AbstractTensorHandle* var_ptr = nullptr; PartialTensorShape scalar_shape; TF_EXPECT_OK( - PartialTensorShape::MakePartialShape({}, 0, &scalar_shape)); + PartialTensorShape::MakePartialShape({}, 0, &scalar_shape)); TF_EXPECT_OK(tensorflow::ops::VarHandleOp(ctx_.get(), &var_ptr, DT_FLOAT, scalar_shape)); var.reset(var_ptr); diff --git a/tensorflow/cc/framework/cc_ops_test.cc b/tensorflow/cc/framework/cc_ops_test.cc index 178b4da972a..4c978da32ea 100644 --- a/tensorflow/cc/framework/cc_ops_test.cc +++ b/tensorflow/cc/framework/cc_ops_test.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "tensorflow/cc/client/client_session.h" #include "tensorflow/cc/framework/testutil.h" #include "tensorflow/cc/ops/standard_ops.h" @@ -241,7 +243,7 @@ TEST(CCOpTest, InvalidFinalize) { ops::ReaderReadUpTo(root, Variable(root, {}, DT_STRING), Variable(root, {}, DT_STRING), static_cast(2)); EXPECT_FALSE(root.status().ok()); - auto err_msg = root.status().error_message(); + auto err_msg = std::string(root.status().message()); EXPECT_NE(err_msg.find("'num_records' passed int32 expected int64"), string::npos); } diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc index 75291678177..2256d795422 100644 --- a/tensorflow/cc/framework/gradients_test.cc +++ b/tensorflow/cc/framework/gradients_test.cc @@ -459,7 +459,7 @@ TEST_F(GradientsTest, UnreachableInput) { Status status = AddSymbolicGradients(scope_test_, {m1}, {z}, {dm1}, &grad_outputs); EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); - EXPECT_EQ(status.error_message(), + EXPECT_EQ(status.message(), "Cannot compute the partial derivative" " for node 'z' as it's unreachable from the output node(s)."); } diff --git a/tensorflow/cc/ops/while_loop_test.cc b/tensorflow/cc/ops/while_loop_test.cc index 18b8be3794f..1e9338eb0c2 100644 --- a/tensorflow/cc/ops/while_loop_test.cc +++ b/tensorflow/cc/ops/while_loop_test.cc @@ -42,7 +42,7 @@ class WhileLoopTest : public ::testing::Test { Status s = ops::BuildWhileLoop(scope_, inputs_, cond, body, kFrameName, &outputs_); EXPECT_EQ(s.code(), error_code); - EXPECT_EQ(s.error_message(), error_msg); + EXPECT_EQ(s.message(), error_msg); } template diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index a3e4c20b7c9..d52db030b1b 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -26,7 +26,9 @@ package( licenses = ["notice"], ) -exports_files(["loader.h"]) +exports_files([ + "loader.h", +]) cc_library( name = "constants", @@ -58,9 +60,9 @@ cc_library( hdrs = ["reader.h"], deps = [ ":constants", - "//tensorflow/core:protos_all_cc", ":metrics", ":util", + "//tensorflow/core:protos_all_cc", ] + if_not_mobile([ # TODO(b/111634734): :lib and :protos_all contain dependencies that # cannot be built on mobile platforms. Instead, include the appropriate @@ -158,6 +160,7 @@ cc_library( "//tensorflow/core/util/tensor_bundle", "//tensorflow/core/util/tensor_bundle:byteswaptensor", "@com_google_absl//absl/container:flat_hash_set", + "@jsoncpp_git//:jsoncpp", ], ) @@ -186,6 +189,11 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/platform:test", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@jsoncpp_git//:jsoncpp", ], ) @@ -331,7 +339,12 @@ cc_library( "//tensorflow/python:__pkg__", "//tensorflow/security/fuzzing/cc/ops:__pkg__", # TODO(b/261455394): Remove. 
], - deps = if_not_mobile(["//tensorflow/core:lib"]) + if_android(["//tensorflow/core:portable_tensorflow_lib_lite"]), + deps = [ + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@jsoncpp_git//:jsoncpp", + ] + if_not_mobile(["//tensorflow/core:lib"]) + if_android(["//tensorflow/core:portable_tensorflow_lib_lite"]), alwayslink = True, ) @@ -341,7 +354,11 @@ cc_library( visibility = ["//tensorflow/python/saved_model:__subpackages__"], deps = if_static([ ":metrics_impl", - ]) + if_not_mobile(["//tensorflow/core:lib"]) + if_android(["//tensorflow/core:portable_tensorflow_lib_lite"]), + ]) + if_not_mobile(["//tensorflow/core:lib"]) + if_android(["//tensorflow/core:portable_tensorflow_lib_lite"]) + [ + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + ], ) tf_cc_test( @@ -350,9 +367,10 @@ tf_cc_test( srcs = ["metrics_test.cc"], deps = [ ":metrics", - "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_googletest//:gtest_main", + "@jsoncpp_git//:jsoncpp", ], ) @@ -392,14 +410,14 @@ cc_library( ], deps = [ ":constants", + "//tensorflow/core:protos_all_cc", "//tensorflow/core/graph/regularization:simple_delete", "//tensorflow/core/graph/regularization:util", - "//tensorflow/core:protos_all_cc", "//tensorflow/core/util/tensor_bundle:naming", "//tensorflow/tsl/platform:types", - "@com_google_protobuf//:protobuf_headers", "@com_google_absl//absl/container:btree", "@com_google_absl//absl/strings", + "@com_google_protobuf//:protobuf_headers", ] + if_not_mobile(["//tensorflow/core:lib"]) + if_android(["//tensorflow/core:portable_tensorflow_lib_lite"]), alwayslink = True, ) @@ -407,7 +425,12 @@ cc_library( cc_library( name = "fingerprinting", hdrs = ["fingerprinting.h"], - visibility = ["//tensorflow/python/saved_model:__subpackages__"], + visibility = [ + "//learning/brain/contrib/hub/server/distro:__subpackages__", + "//learning/brain/contrib/tpu_modeling:__subpackages__", + "//learning/tfx/pipeline/util:__subpackages__", + "//tensorflow/python/saved_model:__subpackages__", + ], deps = if_static([ ":fingerprinting_impl", "@com_google_absl//absl/strings", diff --git a/tensorflow/cc/saved_model/bundle_v2.cc b/tensorflow/cc/saved_model/bundle_v2.cc index 90f220575c5..21692edbf40 100644 --- a/tensorflow/cc/saved_model/bundle_v2.cc +++ b/tensorflow/cc/saved_model/bundle_v2.cc @@ -73,8 +73,8 @@ Status ReadSavedModelProto(const string& export_dir, Status err; if (found_pb.code() == found_pbtxt.code()) { - err = Status(found_pb.code(), StrCat(found_pb.error_message(), "\n", - found_pbtxt.error_message())); + err = Status(found_pb.code(), + StrCat(found_pb.message(), "\n", found_pbtxt.message())); } else if (found_pb.code() == NOT_FOUND) { err = found_pbtxt; } else if (found_pbtxt.code() == NOT_FOUND) { @@ -171,11 +171,17 @@ Status SavedModelV2Bundle::Load(const std::string& export_dir, // Read the fingerprint. auto fingerprint_proto = saved_model::fingerprinting::ReadSavedModelFingerprint(export_dir); + std::string singleprint = ""; if (fingerprint_proto.ok()) { - // Set gauge cell with saved_model_checksum. 
metrics::SavedModelReadFingerprint().Set( - std::to_string(fingerprint_proto->saved_model_checksum())); + metrics::MakeFingerprintJson(fingerprint_proto.value())); + + singleprint = + saved_model::fingerprinting::Singleprint(fingerprint_proto.value()); } + + metrics::SavedModelReadPathAndSingleprint().Set( + metrics::MakeSavedModelPathAndSingleprint(export_dir, singleprint)); return OkStatus(); } diff --git a/tensorflow/cc/saved_model/bundle_v2.h b/tensorflow/cc/saved_model/bundle_v2.h index 76e6ce20e70..e199bd1cc5d 100644 --- a/tensorflow/cc/saved_model/bundle_v2.h +++ b/tensorflow/cc/saved_model/bundle_v2.h @@ -25,8 +25,8 @@ limitations under the License. #include #include "absl/container/flat_hash_set.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" #include "tensorflow/core/protobuf/trackable_object_graph.pb.h" diff --git a/tensorflow/cc/saved_model/bundle_v2_test.cc b/tensorflow/cc/saved_model/bundle_v2_test.cc index f6434914455..6dc3be0bf56 100644 --- a/tensorflow/cc/saved_model/bundle_v2_test.cc +++ b/tensorflow/cc/saved_model/bundle_v2_test.cc @@ -17,19 +17,25 @@ limitations under the License. #include #include +#include #include +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "json/json.h" +#include "json/reader.h" +#include "json/value.h" #include "tensorflow/cc/saved_model/metrics.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/fingerprint.pb.h" +#include "tensorflow/tsl/lib/core/status_test_util.h" namespace tensorflow { namespace { constexpr char kTestData[] = "cc/saved_model/testdata"; -// This is the value in testdata/VarsAndArithmeticObjectGraph/fingerprint.pb -constexpr char kV2ModuleSavedModelChecksum[] = "15788619162413586750"; class BundleV2Test : public ::testing::Test { protected: @@ -116,10 +122,33 @@ TEST_F(BundleV2Test, UpdatesMetrics) { EXPECT_EQ(metrics::SavedModelReadCount("2").value(), read_count + 1); EXPECT_EQ(metrics::SavedModelReadApi(kCCLoadBundleV2Label).value(), api_count + 1); - // Check that the gauge contains the fingerprint. - EXPECT_EQ(metrics::SavedModelReadFingerprint().value(), - kV2ModuleSavedModelChecksum); + // Check that the gauge contains the path and fingerprint. 
EXPECT_EQ(metrics::SavedModelReadPath().value(), export_dir); + + Json::Value fingerprint = Json::objectValue; + Json::Reader reader = Json::Reader(); + reader.parse(metrics::SavedModelReadFingerprint().value(), fingerprint); + EXPECT_EQ(fingerprint["saved_model_checksum"].asUInt64(), + 15788619162413586750ULL); + EXPECT_EQ(fingerprint["graph_def_program_hash"].asUInt64(), + 706963557435316516ULL); + EXPECT_EQ(fingerprint["signature_def_hash"].asUInt64(), + 5693392539583495303ULL); + EXPECT_EQ(fingerprint["saved_object_graph_hash"].asUInt64(), + 12074714563970609759ULL); + EXPECT_EQ(fingerprint["checkpoint_hash"].asUInt64(), 10788359570789890102ULL); + + // TODO(adamcogdell): add ASSERT_OK_AND_ASSIGN here after migrating + // cc/saved_model code from the tsl version of StatusOr to absl::StatusOr + auto [path, singleprint] = metrics::ParseSavedModelPathAndSingleprint( + metrics::SavedModelReadPathAndSingleprint().value()); + EXPECT_TRUE(absl::StrContains( + path, absl::StrCat(kTestData, "/VarsAndArithmeticObjectGraph"))); + EXPECT_EQ(singleprint, + "706963557435316516/" // graph_def_program_hash + "5693392539583495303/" // signature_def_hash + "12074714563970609759/" // saved_object_graph_hash + "10788359570789890102"); // checkpoint_hash } } // namespace diff --git a/tensorflow/cc/saved_model/fingerprinting.cc b/tensorflow/cc/saved_model/fingerprinting.cc index 0bb064c107f..389b28bf278 100644 --- a/tensorflow/cc/saved_model/fingerprinting.cc +++ b/tensorflow/cc/saved_model/fingerprinting.cc @@ -152,16 +152,14 @@ StatusOr ReadSavedModelFingerprint( const string fingerprint_pb_path = io::JoinPath(export_dir, kFingerprintFilenamePb); Status found_pb = Env::Default()->FileExists(fingerprint_pb_path); - if (found_pb.ok()) { - FingerprintDef fingerprint_proto; - Status result = ReadBinaryProto(Env::Default(), fingerprint_pb_path, - &fingerprint_proto); - if (result.ok()) { - return fingerprint_proto; - } - return result; - } - return found_pb; + if (!found_pb.ok()) return found_pb; + + FingerprintDef fingerprint_proto; + Status result = + ReadBinaryProto(Env::Default(), fingerprint_pb_path, &fingerprint_proto); + if (!result.ok()) return result; + + return fingerprint_proto; } std::string Singleprint(uint64 graph_def_program_hash, diff --git a/tensorflow/cc/saved_model/fingerprinting_test.cc b/tensorflow/cc/saved_model/fingerprinting_test.cc index 7e298cfc844..1c1e12440d6 100644 --- a/tensorflow/cc/saved_model/fingerprinting_test.cc +++ b/tensorflow/cc/saved_model/fingerprinting_test.cc @@ -136,7 +136,8 @@ TEST(FingerprintingTest, TestReadValidFingerprint) { TEST(FingerprintingTest, TestReadNonexistentFingerprint) { const std::string export_dir = io::JoinPath( testing::TensorFlowSrcRoot(), "cc/saved_model/testdata", "AssetModule"); - EXPECT_FALSE(ReadSavedModelFingerprint(export_dir).ok()); + EXPECT_EQ(ReadSavedModelFingerprint(export_dir).status().code(), + absl::StatusCode::kNotFound); } TEST(FingerprintingTest, TestSingleprint) { diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 75869afe687..b9544bc7555 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/cc/saved_model/util.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/tensor.pb.h" @@ -38,7 +39,6 @@ limitations under the License. #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/file_system_helper.h" #include "tensorflow/core/platform/statusor.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/core/protobuf/saver.pb.h" #include "tensorflow/core/public/session.h" diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h index 9d43f4ecc76..f2d318a25b7 100644 --- a/tensorflow/cc/saved_model/loader.h +++ b/tensorflow/cc/saved_model/loader.h @@ -21,8 +21,8 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/core/public/session.h" diff --git a/tensorflow/cc/saved_model/metrics.cc b/tensorflow/cc/saved_model/metrics.cc index 86ff72a7839..f44abe8b659 100644 --- a/tensorflow/cc/saved_model/metrics.cc +++ b/tensorflow/cc/saved_model/metrics.cc @@ -16,10 +16,15 @@ limitations under the License. #include "tensorflow/cc/saved_model/metrics.h" #include +#include +#include "json/config.h" +#include "json/json.h" +#include "json/writer.h" #include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/lib/monitoring/gauge.h" #include "tensorflow/core/lib/monitoring/sampler.h" +#include "tensorflow/core/protobuf/fingerprint.pb.h" namespace tensorflow { namespace metrics { @@ -64,6 +69,17 @@ auto* saved_model_write_path = monitoring::Gauge::New( "/tensorflow/core/saved_model/write/path", "The path (saved_model_path) of the exported SavedModel."); +// Gauge that contains the path (saved_model_path) and the singleprint +// (concatenation of graph_def_program_hash, signature_def_hash, +// saved_object_graph_hash, and checkpoint_hash) of the newly written +// SavedModel. +auto* saved_model_write_path_and_singleprint = + monitoring::Gauge::New( + "/tensorflow/core/saved_model/write/path_and_singleprint", + "The path (saved_model_path) and singleprint (concatenation of " + "graph_def_program_hash, signature_def_hash, saved_object_graph_hash, " + "and checkpoint_hash) of the newly written SavedModel."); + // Gauge that contains the fingerprint (saved_model_checksum) of the loaded // SavedModel. auto* saved_model_read_fingerprint = monitoring::Gauge::New( @@ -75,6 +91,15 @@ auto* saved_model_read_path = monitoring::Gauge::New( "/tensorflow/core/saved_model/read/path", "The path (saved_model_path) of the loaded SavedModel."); +// Gauge that contains the path (saved_model_path) and the singleprint +// (concatenation of graph_def_program_hash, signature_def_hash, +// saved_object_graph_hash, and checkpoint_hash) of the loaded SavedModel. 
+auto* saved_model_read_path_and_singleprint = monitoring::Gauge::New( + "/tensorflow/core/saved_model/read/path_and_singleprint", + "The path (saved_model_path) and singleprint (concatenation of " + "graph_def_program_hash, signature_def_hash, saved_object_graph_hash, " + "and checkpoint_hash) of the loaded SavedModel."); + // Distribution of checkpoint write durations. auto* checkpoint_write_durations = monitoring::Sampler<1>::New( { @@ -153,6 +178,10 @@ monitoring::GaugeCell& SavedModelReadPath() { return *saved_model_read_path->GetCell(); } +monitoring::GaugeCell& SavedModelReadPathAndSingleprint() { + return *saved_model_read_path_and_singleprint->GetCell(); +} + monitoring::GaugeCell& SavedModelWriteFingerprint() { return *saved_model_write_fingerprint->GetCell(); } @@ -161,6 +190,41 @@ monitoring::GaugeCell& SavedModelWritePath() { return *saved_model_write_path->GetCell(); } +monitoring::GaugeCell& SavedModelWritePathAndSingleprint() { + return *saved_model_write_path_and_singleprint->GetCell(); +} + +string MakeFingerprintJson(FingerprintDef fingerprint_serialized) { + Json::Value fingerprint = Json::objectValue; + fingerprint["saved_model_checksum"] = + Json::UInt64(fingerprint_serialized.saved_model_checksum()); + fingerprint["graph_def_program_hash"] = + Json::UInt64(fingerprint_serialized.graph_def_program_hash()); + fingerprint["signature_def_hash"] = + Json::UInt64(fingerprint_serialized.signature_def_hash()); + fingerprint["saved_object_graph_hash"] = + Json::UInt64(fingerprint_serialized.saved_object_graph_hash()); + fingerprint["checkpoint_hash"] = + Json::UInt64(fingerprint_serialized.checkpoint_hash()); + + Json::StreamWriterBuilder json_factory; + return Json::writeString(json_factory, fingerprint); +} + +string MakeSavedModelPathAndSingleprint(string path, string singleprint) { + return absl::StrCat(path, ":", singleprint); +} + +std::pair ParseSavedModelPathAndSingleprint( + string path_and_singleprint) { + size_t delimiter = path_and_singleprint.rfind(':'); + if (delimiter == std::string::npos) { + return std::pair("", ""); + } + return std::pair(path_and_singleprint.substr(0, delimiter), + path_and_singleprint.substr(delimiter + 1)); +} + monitoring::SamplerCell& CheckpointReadDuration(absl::string_view api_label) { return *checkpoint_read_durations->GetCell(std::string(api_label)); } diff --git a/tensorflow/cc/saved_model/metrics.h b/tensorflow/cc/saved_model/metrics.h index f89374af0fa..c39b9c3bc8f 100644 --- a/tensorflow/cc/saved_model/metrics.h +++ b/tensorflow/cc/saved_model/metrics.h @@ -20,11 +20,13 @@ limitations under the License. #ifndef TENSORFLOW_CC_SAVED_MODEL_METRICS_H_ #define TENSORFLOW_CC_SAVED_MODEL_METRICS_H_ -#include +#include +#include "absl/status/status.h" #include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/lib/monitoring/gauge.h" #include "tensorflow/core/lib/monitoring/sampler.h" +#include "tensorflow/core/protobuf/fingerprint.pb.h" namespace tensorflow { namespace metrics { @@ -49,6 +51,12 @@ monitoring::GaugeCell& SavedModelWriteFingerprint(); // the saved_model_path of the SM when it is exported. monitoring::GaugeCell& SavedModelWritePath(); +// Returns "/tensorflow/core/saved_model/write/path_and_fingerprint" cell, which +// contains the path (saved_model_path) and fingerprint (concatenation of +// graph_def_program_hash, signature_def_hash, saved_object_graph_hash, +// and checkpoint_hash) of the SavedModel when it is exported. 
+monitoring::GaugeCell& SavedModelWritePathAndSingleprint(); + // Returns "/tensorflow/core/saved_model/read/fingerprint" cell, wich contains // the saved_model_checksum of the SM's fingerprint when it is imported. monitoring::GaugeCell& SavedModelReadFingerprint(); @@ -57,6 +65,24 @@ monitoring::GaugeCell& SavedModelReadFingerprint(); // the saved_model_path of the SM when it is imported. monitoring::GaugeCell& SavedModelReadPath(); +// Returns "/tensorflow/core/saved_model/read/path_and_singleprint" cell, which +// contains the path (saved_model_path) and singleprint (concatenation of +// graph_def_program_hash, signature_def_hash, saved_object_graph_hash, +// and checkpoint_hash) of the SavedModel when it is imported. +monitoring::GaugeCell& SavedModelReadPathAndSingleprint(); + +// Returns the fingerprint as a Json string. +string MakeFingerprintJson(FingerprintDef fingerprint_serialized); + +// Returns canonical string concatenation of path and singleprint. +string MakeSavedModelPathAndSingleprint(string path, string singleprint); + +// TODO(adamcogdell): change to StatusOr<> to account for missing delimiter +// Returns path and singleprint as a pair, parsed canonically from the string +// metric. +std::pair ParseSavedModelPathAndSingleprint( + string path_and_singleprint); + +// Returns "/tensorflow/core/saved_model/write/api" cell. This metric has 1 // field "api_label" which corresponds to a SavedModel write API. The cell for // `foo` should be incremented when the write API `foo` is called. diff --git a/tensorflow/cc/saved_model/metrics_test.cc b/tensorflow/cc/saved_model/metrics_test.cc index 0f876040b44..b4901c942c1 100644 --- a/tensorflow/cc/saved_model/metrics_test.cc +++ b/tensorflow/cc/saved_model/metrics_test.cc @@ -15,6 +15,10 @@ limitations under the License.
#include "tensorflow/cc/saved_model/metrics.h" +#include +#include +#include "json/json.h" +#include "json/reader.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -89,6 +93,17 @@ TEST(MetricsTest, TestWritePath) { EXPECT_EQ(SavedModelWritePath().value(), "bar"); } +TEST(MetricsTest, TestWritePathAndSingleprint) { + EXPECT_EQ(SavedModelWritePathAndSingleprint().value(), ""); + SavedModelWritePathAndSingleprint().Set("foo"); + EXPECT_EQ(SavedModelWritePathAndSingleprint().value(), "foo"); + SavedModelWritePathAndSingleprint().Set("bar"); + EXPECT_EQ(SavedModelWritePathAndSingleprint().value(), "bar"); + + EXPECT_EQ(MakeSavedModelPathAndSingleprint("path", "singleprint"), + "path:singleprint"); +} + TEST(MetricsTest, TestReadFingerprint) { EXPECT_EQ(SavedModelReadFingerprint().value(), ""); SavedModelReadFingerprint().Set("foo"); @@ -105,5 +120,44 @@ TEST(MetricsTest, TestReadPath) { EXPECT_EQ(SavedModelReadPath().value(), "bar"); } +TEST(MetricsTest, TestReadPathAndSingleprint) { + EXPECT_EQ(SavedModelReadPathAndSingleprint().value(), ""); + SavedModelReadPathAndSingleprint().Set("foo"); + EXPECT_EQ(SavedModelReadPathAndSingleprint().value(), "foo"); + SavedModelReadPathAndSingleprint().Set("bar"); + EXPECT_EQ(SavedModelReadPathAndSingleprint().value(), "bar"); + + auto [path, singleprint] = + ParseSavedModelPathAndSingleprint("path/model:name:singleprint"); + EXPECT_EQ(path, "path/model:name"); + EXPECT_EQ(singleprint, "singleprint"); +} + +TEST(MetricsTest, TestMakeFingerprintJson) { + FingerprintDef fingerprint; + fingerprint.set_saved_model_checksum(1); + fingerprint.set_graph_def_program_hash(2); + fingerprint.set_signature_def_hash(3); + fingerprint.set_saved_object_graph_hash(4); + fingerprint.set_checkpoint_hash(5); + + string serialized_fingerprint_json = MakeFingerprintJson(fingerprint); + + EXPECT_EQ( + serialized_fingerprint_json, + "{\n\t\"checkpoint_hash\" : 5,\n\t\"graph_def_program_hash\" : " + "2,\n\t\"saved_model_checksum\" : 1,\n\t\"saved_object_graph_hash\" : " + "4,\n\t\"signature_def_hash\" : 3\n}"); + + Json::Value fingerprint_json = Json::objectValue; + Json::Reader reader = Json::Reader(); + reader.parse(serialized_fingerprint_json, fingerprint_json); + EXPECT_EQ(fingerprint_json["saved_model_checksum"].asUInt64(), 1); + EXPECT_EQ(fingerprint_json["graph_def_program_hash"].asUInt64(), 2); + EXPECT_EQ(fingerprint_json["signature_def_hash"].asUInt64(), 3); + EXPECT_EQ(fingerprint_json["saved_object_graph_hash"].asUInt64(), 4); + EXPECT_EQ(fingerprint_json["checkpoint_hash"].asUInt64(), 5); +} + } // namespace metrics } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/reader.h b/tensorflow/cc/saved_model/reader.h index 2c2cb865b93..f51fbeb557f 100644 --- a/tensorflow/cc/saved_model/reader.h +++ b/tensorflow/cc/saved_model/reader.h @@ -21,8 +21,8 @@ limitations under the License. 
#include #include +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" namespace tensorflow { diff --git a/tensorflow/cc/saved_model/reader_test.cc b/tensorflow/cc/saved_model/reader_test.cc index 443c04efe45..4b8b5cde20d 100644 --- a/tensorflow/cc/saved_model/reader_test.cc +++ b/tensorflow/cc/saved_model/reader_test.cc @@ -71,9 +71,9 @@ TEST_F(ReaderTest, NoTagMatch) { &meta_graph_def); EXPECT_FALSE(st.ok()); EXPECT_TRUE(absl::StrContains( - st.error_message(), + st.message(), "Could not find meta graph def matching supplied tags: { missing-tag }")) - << st.error_message(); + << st.message(); } TEST_F(ReaderTest, NoTagMatchMultiple) { @@ -84,9 +84,8 @@ TEST_F(ReaderTest, NoTagMatchMultiple) { export_dir, {kSavedModelTagServe, "missing-tag"}, &meta_graph_def); EXPECT_FALSE(st.ok()); EXPECT_TRUE(absl::StrContains( - st.error_message(), - "Could not find meta graph def matching supplied tags: ")) - << st.error_message(); + st.message(), "Could not find meta graph def matching supplied tags: ")) + << st.message(); } TEST_F(ReaderTest, PbtxtFormat) { diff --git a/tensorflow/cc/saved_model/saved_model_bundle_lite_test.cc b/tensorflow/cc/saved_model/saved_model_bundle_lite_test.cc index 604fc412800..5d3690ea1a5 100644 --- a/tensorflow/cc/saved_model/saved_model_bundle_lite_test.cc +++ b/tensorflow/cc/saved_model/saved_model_bundle_lite_test.cc @@ -159,9 +159,9 @@ TEST_F(LoaderTest, NoTagMatch) { {"missing-tag"}, &bundle); EXPECT_FALSE(st.ok()); EXPECT_TRUE(absl::StrContains( - st.error_message(), + st.message(), "Could not find meta graph def matching supplied tags: { missing-tag }")) - << st.error_message(); + << st.message(); } TEST_F(LoaderTest, NoTagMatchMultiple) { @@ -175,9 +175,8 @@ TEST_F(LoaderTest, NoTagMatchMultiple) { {kSavedModelTagServe, "missing-tag"}, &bundle); EXPECT_FALSE(st.ok()); EXPECT_TRUE(absl::StrContains( - st.error_message(), - "Could not find meta graph def matching supplied tags: ")) - << st.error_message(); + st.message(), "Could not find meta graph def matching supplied tags: ")) + << st.message(); } TEST_F(LoaderTest, SessionCreationFailure) { @@ -194,8 +193,7 @@ TEST_F(LoaderTest, SessionCreationFailure) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); - EXPECT_TRUE(absl::StrContains(st.error_message(), kInvalidTarget)) - << st.error_message(); + EXPECT_TRUE(absl::StrContains(st.message(), kInvalidTarget)) << st.message(); } TEST_F(LoaderTest, PbtxtFormat) { diff --git a/tensorflow/cc/saved_model/saved_model_bundle_test.cc b/tensorflow/cc/saved_model/saved_model_bundle_test.cc index 7e78aee67b1..eda63fba4fe 100644 --- a/tensorflow/cc/saved_model/saved_model_bundle_test.cc +++ b/tensorflow/cc/saved_model/saved_model_bundle_test.cc @@ -189,9 +189,9 @@ TEST_F(LoaderTest, NoTagMatch) { {"missing-tag"}, &bundle); EXPECT_FALSE(st.ok()); EXPECT_TRUE(absl::StrContains( - st.error_message(), + st.message(), "Could not find meta graph def matching supplied tags: { missing-tag }")) - << st.error_message(); + << st.message(); } TEST_F(LoaderTest, NoTagMatchMultiple) { @@ -205,9 +205,8 @@ TEST_F(LoaderTest, NoTagMatchMultiple) { {kSavedModelTagServe, "missing-tag"}, &bundle); EXPECT_FALSE(st.ok()); EXPECT_TRUE(absl::StrContains( - st.error_message(), - "Could not find meta graph def matching supplied tags: ")) - << st.error_message(); + 
st.message(), "Could not find meta graph def matching supplied tags: ")) + << st.message(); } TEST_F(LoaderTest, SessionCreationFailure) { @@ -224,8 +223,7 @@ TEST_F(LoaderTest, SessionCreationFailure) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); - EXPECT_TRUE(absl::StrContains(st.error_message(), kInvalidTarget)) - << st.error_message(); + EXPECT_TRUE(absl::StrContains(st.message(), kInvalidTarget)) << st.message(); } TEST_F(LoaderTest, PbtxtFormat) { @@ -317,9 +315,8 @@ TEST_F(LoaderTest, NegativeShapeDimension) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); - EXPECT_NE( - st.error_message().find("initializes from a tensor with -1 elements"), - std::string::npos); + EXPECT_NE(st.message().find("initializes from a tensor with -1 elements"), + std::string::npos); } TEST_F(LoaderTest, ConstNoValue) { @@ -332,9 +329,8 @@ TEST_F(LoaderTest, ConstNoValue) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); - EXPECT_NE( - st.error_message().find("constant tensor but no value has been provided"), - std::string::npos); + EXPECT_NE(st.message().find("constant tensor but no value has been provided"), + std::string::npos); } TEST_F(LoaderTest, BadNodeAttr) { @@ -347,9 +343,8 @@ TEST_F(LoaderTest, BadNodeAttr) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); - EXPECT_NE( - st.error_message().find("constant tensor but no value has been provided"), - std::string::npos); + EXPECT_NE(st.message().find("constant tensor but no value has been provided"), + std::string::npos); } TEST_F(LoaderTest, UpdateMetricsV2) { diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h index ca2b5f956bf..016b27101b9 100644 --- a/tensorflow/cc/training/coordinator.h +++ b/tensorflow/cc/training/coordinator.h @@ -37,7 +37,8 @@ class RunnerInterface { virtual ~RunnerInterface() {} virtual Status Join() = 0; virtual Status ExportCostGraph(CostGraphDef* cost_graph) const { - return Status(error::INVALID_ARGUMENT, "No cost model to export."); + return Status(absl::StatusCode::kInvalidArgument, + "No cost model to export."); } /// Returns true iff the runner is running, i.e. if it is trying to populate /// its queue. 
diff --git a/tensorflow/cc/training/coordinator_test.cc b/tensorflow/cc/training/coordinator_test.cc index 75e0da6f8f0..75793297ddd 100644 --- a/tensorflow/cc/training/coordinator_test.cc +++ b/tensorflow/cc/training/coordinator_test.cc @@ -179,21 +179,24 @@ TEST(CoordinatorTest, StatusReporting) { BlockingCounter counter(3); std::unique_ptr qr1(new MockQueueRunner(&coord)); - qr1->StartSettingStatus(Status(Code::CANCELLED, ""), &counter, &start); + qr1->StartSettingStatus(Status(absl::StatusCode::kCancelled, ""), &counter, + &start); TF_ASSERT_OK(coord.RegisterRunner(std::move(qr1))); std::unique_ptr qr2(new MockQueueRunner(&coord)); - qr2->StartSettingStatus(Status(Code::INVALID_ARGUMENT, ""), &counter, &start); + qr2->StartSettingStatus(Status(absl::StatusCode::kInvalidArgument, ""), + &counter, &start); TF_ASSERT_OK(coord.RegisterRunner(std::move(qr2))); std::unique_ptr qr3(new MockQueueRunner(&coord)); - qr3->StartSettingStatus(Status(Code::OUT_OF_RANGE, ""), &counter, &start); + qr3->StartSettingStatus(Status(absl::StatusCode::kOutOfRange, ""), &counter, + &start); TF_ASSERT_OK(coord.RegisterRunner(std::move(qr3))); start.Notify(); counter.Wait(); TF_EXPECT_OK(coord.RequestStop()); - EXPECT_EQ(coord.Join().code(), Code::INVALID_ARGUMENT); + EXPECT_EQ(coord.Join().code(), absl::StatusCode::kInvalidArgument); } TEST(CoordinatorTest, JoinWithoutStop) { diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index 8a4414a96d7..18e3182e686 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -41,8 +41,8 @@ using ::xla::cpu_function_runtime::BufferInfo; void ExpectErrorContains(const Status& status, absl::string_view str) { EXPECT_NE(OkStatus(), status); - EXPECT_TRUE(absl::StrContains(status.error_message(), str)) - << "expected error: " << status.error_message() << " to contain: " << str; + EXPECT_TRUE(absl::StrContains(status.message(), str)) + << "expected error: " << status.message() << " to contain: " << str; } TEST(ValidateCppIdent, Simple) { diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index 3217174b79e..fd3bf0bb7e9 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -68,7 +68,7 @@ Status CompileXla(xla::CompileOnlyClient* client, client->GetComputationShape(computation); if (!pshape_or.ok()) { return errors::Unknown("Couldn't get XLA program shape: ", - pshape_or.status().error_message()); + pshape_or.status().message()); } compile_result->program_shape = pshape_or.value()->ToProto(); xla::ProgramShapeProto* pshape = &compile_result->program_shape; @@ -91,7 +91,7 @@ Status CompileXla(xla::CompileOnlyClient* client, aot_or = client->CompileAheadOfTime({instance}, aot_opts); if (!aot_or.ok()) { return errors::Unknown("XLA compilation failed: ", - aot_or.status().error_message()); + aot_or.status().message()); } compile_result->aot = xla::unique_ptr_static_cast( @@ -260,7 +260,7 @@ Status Main(const MainFlags& flags) { CompileGraph(std::move(graph_def), config, flags, &compile_result); if (!status.ok()) { return errors::CreateWithUpdatedMessage( - status, InterpolateErrorMessage(status.error_message())); + status, InterpolateErrorMessage(std::string(status.message()))); } // Write output files. 
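One subtlety in the compile.cc hunk above: since message() returns absl::string_view rather than an owned string, call sites that need a std::string (such as the InterpolateErrorMessage argument) add an explicit conversion. A hedged sketch of that pattern; RewriteMessage below is a hypothetical stand-in for any helper taking an owned string:

#include <string>

#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/status.h"

// Hypothetical helper that, like InterpolateErrorMessage, consumes an owned
// std::string, forcing an explicit copy of the string_view from message().
std::string RewriteMessage(std::string msg) { return "tfcompile: " + msg; }

tensorflow::Status WithRewrittenMessage(const tensorflow::Status& status) {
  if (status.ok()) return status;
  // CreateWithUpdatedMessage keeps the original error code and swaps in the
  // new message, mirroring the usage in tensorflow/compiler/aot/compile.cc.
  return tensorflow::errors::CreateWithUpdatedMessage(
      status, RewriteMessage(std::string(status.message())));
}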
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index 4e4c4bad3a3..f04aa37c887 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -69,17 +69,17 @@ py_binary( srcs_version = "PY3", deps = [ "//tensorflow/core:protos_all_py", - "//tensorflow/python", # TODO(b/34059704): remove when fixed "//tensorflow/python:array_ops", "//tensorflow/python:client", + "//tensorflow/python:cond", "//tensorflow/python:control_flow_assert", "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_for_generated_wrappers", "//tensorflow/python:math_ops", "//tensorflow/python:nn_ops", - "//tensorflow/python:platform", "//tensorflow/python:session", "//tensorflow/python:training", + "//tensorflow/python:variable_v1", "//tensorflow/python:variables", "@absl_py//absl:app", "@six_archive//:six", @@ -325,6 +325,112 @@ tfcompile_test_dep_configs = [ for suffix, mlir_component in tfcompile_test_dep_configs ] +tfcompile_bench_tfmatmul_mkn = [ + # Intentionally empty to avoid running unnecessary tests. + # Add here your desired (M, K, N) parameters, e.g. + # (1, 1, 256), + # (1, 2, 256), +] + +tfcompile_bench_tfmatmul = [ + ( + "bench_graph_tfmatmul_%sx%sx%s" % (m, k, n), + "bench_graph_tfmatmul_%sx%sx%s.config.pbtxt" % (m, k, n), + "bench_graph_tfmatmul.template.pbtxt", + "-e \"s||%s|g\" -e \"s||%s|g\" -e \"s||%s|g\"" % (m, k, n), + ) + for (m, k, n) in tfcompile_bench_tfmatmul_mkn +] + +test_suite( + name = "all_tfmatmul_benchmarks", + tags = ["manual"], + tests = [ + (":%s_test" % bench_name) + for (bench_name, _, _, _) in tfcompile_bench_tfmatmul + ], + visibility = ["//visibility:public"], +) + +test_suite( + name = "all_tfmatmul_mlir_benchmarks", + tags = ["manual"], + tests = [ + (":%s_mlir_test" % bench_name) + for (bench_name, _, _, _) in tfcompile_bench_tfmatmul + ], + visibility = ["//visibility:public"], +) + +[[ + genrule( + name = "gen_" + config_file, + testonly = 1, + srcs = [template_file], + outs = [config_file], + cmd = ("sed " + sed_replace + " " + + "$(location " + template_file + ") " + + "> $(OUTS)"), + tags = ["manual"], + ), + tf_library( + name = bench_name, + testonly = 1, + config = config_file, + cpp_class = "foo::bar::MatMulComp", + graph = "test_graph_tfmatmul.pb", + tags = [ + "manual", + "no_mac", # TODO(b/228273415) + ], + ), +] for (bench_name, config_file, template_file, sed_replace) in tfcompile_bench_tfmatmul] + +tfcompile_bench_tfmatmul_tile_mkn = [ + # Intentionally empty to avoid running unnecessary tests. + # Add here your desired (M, K, N) parameters, e.g. 
+ # (1, 2, 8), + # (1, 4, 4), +] + +tfcompile_bench_tfmatmul_custom_tiling = [ + ( + "bench_graph_tfmatmul_%sx%sx%s_tiled_%sx%sx%s_mlir" % (m, k, n, tm, tk, tn), + "bench_graph_tfmatmul_%sx%sx%s_tiled_%sx%sx%s_mlir.config.pbtxt" % (m, k, n, tm, tk, tn), + "bench_graph_tfmatmul.template.pbtxt", + "-e \"s||%s|g\" -e \"s||%s|g\" -e \"s||%s|g\"" % (m, k, n), + "--xla_cpu_enable_custom_matmul_tiling --xla_cpu_matmul_tiling_m_dim=%s --xla_cpu_matmul_tiling_k_dim=%s --xla_cpu_matmul_tiling_n_dim=%s" % (tm, tk, tn), + ) + for (m, k, n) in tfcompile_bench_tfmatmul_mkn + for (tm, tk, tn) in tfcompile_bench_tfmatmul_tile_mkn +] + +[[ + genrule( + name = "gen_" + config_file, + testonly = 1, + srcs = [template_file], + outs = [config_file], + cmd = ("sed " + sed_replace + " " + + "$(location " + template_file + ") " + + "> $(OUTS)"), + tags = ["manual"], + ), + tf_library( + name = bench_name, + testonly = 1, + config = config_file, + cpp_class = "foo::bar::MatMulComp", + graph = "test_graph_tfmatmul.pb", + mlir_components = "HloLowering", # XLA:CPU-Next only. + tags = [ + "manual", + "no_mac", # TODO(b/228273415) + ], + xla_flags = xla_flags, + ), +] for (bench_name, config_file, template_file, sed_replace, xla_flags) in tfcompile_bench_tfmatmul_custom_tiling] + tf_cc_test( name = "tfcompile_test", srcs = ["tfcompile_test.cc"], diff --git a/tensorflow/compiler/aot/tests/bench_graph_tfmatmul.template.pbtxt b/tensorflow/compiler/aot/tests/bench_graph_tfmatmul.template.pbtxt new file mode 100644 index 00000000000..5f8f68c8492 --- /dev/null +++ b/tensorflow/compiler/aot/tests/bench_graph_tfmatmul.template.pbtxt @@ -0,0 +1,18 @@ +# Text form of tensorflow.tf2xla.Config proto. +feed { + id { node_name: "x_hold" } + shape { + dim { size: } + dim { size: } + } +} +feed { + id { node_name: "y_hold" } + shape { + dim { size: } + dim { size: } + } +} +fetch { + id { node_name: "x_y_prod" } +} diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py index 07d715725a2..56bea7413ef 100644 --- a/tensorflow/compiler/aot/tests/make_test_graphs.py +++ b/tensorflow/compiler/aot/tests/make_test_graphs.py @@ -31,11 +31,13 @@ from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import array_ops_stack # pylint: disable=g-direct-tensorflow-import +from tensorflow.python.ops import cond from tensorflow.python.ops import control_flow_assert from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import control_flow_util from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import variable_v1 from tensorflow.python.ops import variables from tensorflow.python.training import saver as saver_lib @@ -50,7 +52,7 @@ def tfadd(_): def tfadd_with_ckpt(out_dir): x = array_ops.placeholder(dtypes.int32, name='x_hold') - y = variables.VariableV1(constant_op.constant([0]), name='y_saved') + y = variable_v1.VariableV1(constant_op.constant([0]), name='y_saved') math_ops.add(x, y, name='x_y_sum') init_op = variables.global_variables_initializer() @@ -65,7 +67,7 @@ def tfadd_with_ckpt(out_dir): def tfadd_with_ckpt_saver(out_dir): x = array_ops.placeholder(dtypes.int32, name='x_hold') - y = variables.VariableV1(constant_op.constant([0]), name='y_saved') + y = variable_v1.VariableV1(constant_op.constant([0]), name='y_saved') math_ops.add(x, y, name='x_y_sum') init_op = 
variables.global_variables_initializer() @@ -94,7 +96,7 @@ def tfcond(_): p = array_ops.placeholder(dtypes.bool, name='p_hold') x = array_ops.placeholder(dtypes.int32, name='x_hold') y = array_ops.placeholder(dtypes.int32, name='y_hold') - z = control_flow_ops.cond(p, lambda: x, lambda: y) + z = cond.cond(p, lambda: x, lambda: y) array_ops.identity(z, name='result') diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 6621f46e866..c1f8fdc089a 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -55,6 +55,7 @@ def _tfcompile_model_library_rule_impl(ctx): "--xla_cpu_fast_math_honor_functions=false " + "--xla_cpu_fast_math_honor_division=false " + "--xla_cpu_enable_fast_min_max=true " + + ctx.attr.xla_flags + " " + "$${XLA_FLAGS:-}' "), "CUDA_VISIBLE_DEVICES": "", } @@ -127,6 +128,7 @@ _tfcompile_model_library = rule( "dfsan_abilists": attr.label_list(default = [], allow_files = True), "is_linux": attr.bool(), "gen_compiler_log": attr.bool(), + "xla_flags": attr.string(), }, ) @@ -151,7 +153,8 @@ def _tf_library( mlir_components = "None", deps = None, tags = [], - copts = []): + copts = [], + xla_flags = None): if not cpp_class: fail("cpp_class must be specified") @@ -268,7 +271,7 @@ def _tf_library( tfcompile_config = config, entry_point = ep, cpp_class = cpp_class, - target_cpu = tfcompile_target_cpu(), + target_cpu = tfcompile_target_cpu(name), target_triple = target_llvm_triple(), flags = flags, extra_flags = debug_info_flags + profiling_flags + mlir_flags + traceme_flags, @@ -281,6 +284,7 @@ def _tf_library( visibility = visibility, testonly = testonly, tags = tags, + xla_flags = xla_flags, ) tfcompile_gen_object_files = tfcompile_gen + "_object_files" @@ -327,6 +331,10 @@ def _tf_library( mlir_components.count("HloLowering") > 0 and [ "//tensorflow/compiler/xla/service/cpu:runtime_mlir_utils", ] or [] + ) + ( + include_standard_runtime_deps and mlir_components == "HloLowering" and [ + "//tensorflow/compiler/xla/service/cpu/runtime:retain", + ] or [] ) + (deps or []), tags = tags, copts = copts, @@ -391,6 +399,7 @@ def _tf_library( ]), tags = tags, extra_copts = copts, + visibility = visibility, ) if gen_benchmark: @@ -437,6 +446,7 @@ def _tf_library( "//tensorflow/compiler/aot:benchmark_extra_android", ]), tags = tags, + visibility = visibility, ) def tf_library( @@ -460,7 +470,8 @@ def tf_library( mlir_components = "None", deps = None, tags = [], - copts = []): + copts = [], + xla_flags = None): """Compiles a TensorFlow graph into an executable with fast math enabled. Given an invocation of tf_library(name="foo", ...), generates the following @@ -543,6 +554,7 @@ def tf_library( deps, tags, copts, + xla_flags, ) if mlir_components == "None": _tf_library( @@ -567,6 +579,7 @@ def tf_library( deps, tags + ["notap", "local", "manual"], copts, + xla_flags, ) def target_llvm_triple(): diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index c1f14d30de7..da4fa91867f 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -87,7 +87,7 @@ int main(int argc, char** argv) { "other than flags. 
See --help.\n\n"; tensorflow::Status status = tensorflow::tfcompile::Main(flags); if (status.code() == absl::StatusCode::kInvalidArgument) { - std::cerr << "INVALID ARGUMENTS: " << status.error_message() << "\n\n"; + std::cerr << "INVALID ARGUMENTS: " << status.message() << "\n\n"; return 1; } else { TF_QCHECK_OK(status); diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 0b6869a327d..b3fd29ff259 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -78,6 +78,7 @@ cc_library( "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/core/tfrt/common:pjrt_cpu_client_registration", ] + if_libtpu( if_false = ["//tensorflow/compiler/xla/service:cpu_plugin"], if_true = [], @@ -95,6 +96,7 @@ cc_library( "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/core/tfrt/common:pjrt_gpu_client_registration", ]), alwayslink = 1, ) @@ -120,7 +122,6 @@ cc_library( ":jit_compilation_passes", ":xla_device", ":xla_kernel_creator", # buildcleaner: keep - "@com_google_absl//absl/memory", "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla:layout_util", "//tensorflow/compiler/tf2xla:xla_compiler", @@ -128,6 +129,7 @@ cc_library( "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ] + if_libtpu( if_false = [ "//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep @@ -146,10 +148,8 @@ cc_library( ":flags", ":jit_compilation_passes", ":xla_device", - ":xla_kernel_creator", # buildcleaner: keep ":xla_device_no_jit_rewrite_registration", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", + ":xla_kernel_creator", # buildcleaner: keep "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla:layout_util", "//tensorflow/compiler/tf2xla:xla_compiler", @@ -158,6 +158,8 @@ cc_library( "//tensorflow/compiler/xla/stream_executor/gpu:gpu_init", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ] + if_libtpu( if_false = [ "//tensorflow/compiler/xla/service:gpu_plugin", # buildcleaner: keep @@ -174,14 +176,21 @@ cc_library( visibility = [":friends"], deps = [ ":xla_device", + ":xla_device_context", ":xla_kernel_creator", # buildcleaner: keep - "@com_google_absl//absl/types:optional", "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:layout_util", "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_helpers", "//tensorflow/compiler/tf2xla:xla_op_registry", + "//tensorflow/compiler/xla/stream_executor/tpu:c_api_conversions", + "//tensorflow/compiler/xla/stream_executor/tpu:status_helper", + "//tensorflow/compiler/xla/stream_executor/tpu:tpu_api", + "//tensorflow/compiler/xla/stream_executor/tpu:tpu_executor_base", + "//tensorflow/compiler/xla/stream_executor/tpu:tpu_node_context", + "//tensorflow/compiler/xla/stream_executor/tpu:tpu_platform_interface", + "//tensorflow/compiler/xla/stream_executor/tpu:tpu_stream_interface", "//tensorflow/core:framework_internal", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:protos_all_cc", @@ -190,16 +199,10 @@ cc_library( "//tensorflow/core/common_runtime:device_factory", 
"//tensorflow/core/common_runtime:dma_helper", "//tensorflow/core/platform:status", - "//tensorflow/compiler/xla/stream_executor/tpu:tpu_api", "//tensorflow/core/tpu:tpu_defs", "//tensorflow/core/tpu:tpu_node_device_util", "//tensorflow/core/tpu:virtual_device", - "//tensorflow/compiler/xla/stream_executor/tpu:c_api_conversions", - "//tensorflow/compiler/xla/stream_executor/tpu:status_helper", - "//tensorflow/compiler/xla/stream_executor/tpu:tpu_executor_base", - "//tensorflow/compiler/xla/stream_executor/tpu:tpu_node_context", - "//tensorflow/compiler/xla/stream_executor/tpu:tpu_platform_interface", - "//tensorflow/compiler/xla/stream_executor/tpu:tpu_stream_interface", + "@com_google_absl//absl/types:optional", ] + if_static([ "//tensorflow/core/common_runtime:copy_tensor", ":jit_compilation_passes", @@ -289,13 +292,39 @@ XLA_DEVICE_DEPS = [ "//tensorflow/compiler/xla/stream_executor/platform", ] +cc_library( + name = "xla_device_context", + srcs = ["xla_device_context.cc"], + hdrs = ["xla_device_context.h"], + visibility = ["//visibility:public"], + deps = [ + ":xla_launch_util", + ":xla_tensor", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:layout_util", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/stream_executor/platform", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:portable_gif_internal", + "//tensorflow/core/common_runtime:device", + "//tensorflow/core/common_runtime:dma_helper", + "//tensorflow/core/framework:allocator", + "@com_google_absl//absl/synchronization", + ], +) + cc_library( name = "xla_device_no_jit_rewrite_registration", srcs = [ "xla_compile_on_demand_op.cc", "xla_compiler_options_util.cc", "xla_device.cc", - "xla_device_context.cc", "xla_device_ops.cc", "xla_ops_on_regular_devices.cc", "xla_platform_info.cc", @@ -304,7 +333,6 @@ cc_library( "xla_compile_on_demand_op.h", "xla_compiler_options_util.h", "xla_device.h", - "xla_device_context.h", "xla_device_ops.h", "xla_platform_info.h", ], @@ -313,12 +341,24 @@ cc_library( deps = XLA_DEVICE_DEPS + [ ":device_compilation_cache", ":device_compilation_profiler", + ":device_compiler", ":device_compiler_client", ":device_executable_persistor", ":flags_headers", - ":device_compiler", + ":pjrt_base_device", + ":pjrt_device_compiler_client", ":xla_device_compiler_client", + ":xla_device_context", + "//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/xla/hlo/ir:hlo", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/compiler/xla/service:executable", + "//tensorflow/core/tfrt/common:create_pjrt_client_util", + "//tensorflow/core/tfrt/common:pjrt_util", "//tensorflow/core/tpu:tpu_defs", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/types:span", ], alwayslink = 1, ) @@ -328,7 +368,6 @@ cc_library( hdrs = [ "xla_compile_on_demand_op.h", "xla_device.h", - "xla_device_context.h", "xla_device_ops.h", ], # Public visibility is needed for external TF/XLA backends. 
@@ -337,6 +376,7 @@ cc_library( ":device_compilation_profiler", ":jit_compilation_passes", ":xla_device_no_jit_rewrite_registration", + "//tensorflow/compiler/xla/pjrt:pjrt_client", ], ) @@ -364,9 +404,11 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:dump_graph", "//tensorflow/compiler/xla:parse_flags_from_env", "//tensorflow/core:framework_internal", + "//tensorflow/core:framework_types_hdr", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", ], @@ -379,12 +421,11 @@ cc_library( hdrs = ["flags.h"], visibility = [":friends"], deps = [ - "//tensorflow/compiler/mlir/tensorflow:dump_graph", - "//tensorflow/compiler/xla:parse_flags_from_env", "//tensorflow/core:framework_internal", + "//tensorflow/core:framework_types_hdr", "//tensorflow/core:lib", "//tensorflow/core/protobuf:for_core_protos_cc", - "@com_google_absl//absl/strings", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/types:optional", ], ) @@ -475,6 +516,7 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/pjrt:pjrt_client", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/stream_executor:device_memory_allocator", "//tensorflow/core:core_cpu_internal", @@ -484,8 +526,43 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/tfrt/common:async_value_tensor", + "//tensorflow/tsl/framework:device_id_utils", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/cleanup", + "@com_google_absl//absl/container:flat_hash_set", + ], +) + +tf_cc_test( + name = "xla_launch_util_test", + srcs = ["xla_launch_util_test.cc"], + deps = [ + ":device_compiler", + ":flags_headers", + ":pjrt_device_compiler_client", + ":variable_info", + ":variable_info_util", + ":xla_cpu_device", + ":xla_cpu_jit", + ":xla_device_no_jit_rewrite_registration", + ":xla_launch_util", + "//tensorflow/compiler/tf2xla:xla_op_registry", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/compiler/xla/pjrt:tfrt_cpu_pjrt_client", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/core:framework", + "//tensorflow/core/framework:fake_input", + "//tensorflow/core/framework:tensor_testutil", + "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/platform:refcount", + "//tensorflow/core/tfrt/common:create_pjrt_client_util", + "//tensorflow/core/tfrt/common:pjrt_util", + "//tensorflow/tsl/lib/core:status_test_util", + "//tensorflow/tsl/platform:status", + "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_googletest//:gtest_main", ], ) @@ -511,6 +588,7 @@ tf_cc_test( "xla_compile_util_test.cc", ], deps = [ + ":flags_headers", ":xla_compile_util", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:test_main", @@ -658,6 +736,8 @@ cc_library( name = "xla_kernel_creator", srcs = [ "xla_kernel_creator.cc", + ], + hdrs = [ "xla_kernel_creator.h", ], visibility = [ @@ -777,6 +857,7 @@ cc_library( ":shape_inference", "//tensorflow/compiler/xla:status_macros", "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", ], @@ -1393,6 +1474,19 @@ cc_library( ], ) +cc_library( + name = 
"pjrt_base_device", + srcs = ["pjrt_base_device.cc"], + hdrs = ["pjrt_base_device.h"], + # Public visibility is needed for external TF/XLA backends. + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/tf2xla:layout_util", + "//tensorflow/core:framework", + "//tensorflow/core/common_runtime:local_device", + ], +) + cc_library( name = "pjrt_device_context", srcs = [ @@ -1404,12 +1498,14 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:layout_util", "//tensorflow/compiler/xla/pjrt:pjrt_client", "//tensorflow/core:framework", "//tensorflow/core/platform:status", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/tfrt/common:async_value_tensor", "//tensorflow/core/tfrt/common:create_pjrt_client_util", + "//tensorflow/tsl/framework:device_id_utils", ], ) @@ -1522,11 +1618,15 @@ tf_cuda_cc_test( tf_cuda_cc_test( name = "device_context_test", srcs = ["device_context_test.cc"], - tags = tf_cuda_tests_tags(), + tags = tf_cuda_tests_tags() + [ + "config-cuda-only", + "no_oss", # Temporarily disable OSS. + ], deps = [ ":flags", ":xla_device", ":xla_gpu_device", + ":xla_gpu_jit", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/core:framework_internal", "//tensorflow/core:test", @@ -1541,6 +1641,7 @@ tf_cuda_cc_test( tags = tf_cuda_tests_tags(), deps = [ ":flags", + ":test_util", ":xla_device_no_jit_rewrite_registration", ":xla_gpu_device", ":xla_gpu_jit", @@ -1554,3 +1655,29 @@ tf_cuda_cc_test( "@com_google_googletest//:gtest_main", ], ) + +tf_cuda_cc_test( + name = "xla_platform_info_test", + srcs = ["xla_platform_info_test.cc"], + tags = tf_cuda_tests_tags() + ["config-cuda-only"], + deps = [ + ":flags_headers", + ":test_util", + ":xla_device_no_jit_rewrite_registration", + ":xla_gpu_device", + ":xla_gpu_jit", + "//tensorflow/compiler/tf2xla:layout_util", + "//tensorflow/compiler/xla/pjrt:tfrt_cpu_pjrt_client", + "//tensorflow/core:framework_types_hdr", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:test", + "//tensorflow/core/platform:refcount", + "//tensorflow/core/platform:status_matchers", + "//tensorflow/core/platform:statusor", + "//tensorflow/core/protobuf:error_codes_proto_impl_cc", + "//tensorflow/core/tfrt/common:create_pjrt_client_util", + "//tensorflow/core/tfrt/common:pjrt_util", + "//tensorflow/core/tpu:tpu_defs", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 01f2e1cf24b..cd922170c3c 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -202,7 +202,7 @@ bool RecursiveCompilabilityChecker::HasXLAKernel( Status s = FindKernelDef(jit_device_type_, node.def(), nullptr, nullptr); if (!s.ok()) { - *uncompilable_reason = s.error_message(); + *uncompilable_reason = s.message(); return false; } return true; diff --git a/tensorflow/compiler/jit/device_compiler_disable_test.cc b/tensorflow/compiler/jit/device_compiler_disable_test.cc index cf4b5461861..7853014b2ea 100644 --- a/tensorflow/compiler/jit/device_compiler_disable_test.cc +++ b/tensorflow/compiler/jit/device_compiler_disable_test.cc @@ -68,24 +68,21 @@ TEST(DeviceCompilerTest, TestDisabledXlaCompilation) { XlaCompiler::Options{}, fn, args, XlaCompiler::CompileOptions{}, DeviceCompileMode::kStrict, profiler, &compilation_result, &executable); EXPECT_FALSE(status.ok()); - EXPECT_TRUE( - 
absl::StrContains(status.error_message(), "XLA compilation disabled")); + EXPECT_TRUE(absl::StrContains(status.message(), "XLA compilation disabled")); // Check that async compilation is disallowed. status = xla_device_compiler->CompileIfNeeded( XlaCompiler::Options{}, fn, args, XlaCompiler::CompileOptions{}, DeviceCompileMode::kAsync, profiler, &compilation_result, &executable); EXPECT_FALSE(status.ok()); - EXPECT_TRUE( - absl::StrContains(status.error_message(), "XLA compilation disabled")); + EXPECT_TRUE(absl::StrContains(status.message(), "XLA compilation disabled")); // Check that lazy compilation is disallowed. status = xla_device_compiler->CompileIfNeeded( XlaCompiler::Options{}, fn, args, XlaCompiler::CompileOptions{}, DeviceCompileMode::kLazy, profiler, &compilation_result, &executable); EXPECT_FALSE(status.ok()); - EXPECT_TRUE( - absl::StrContains(status.error_message(), "XLA compilation disabled")); + EXPECT_TRUE(absl::StrContains(status.message(), "XLA compilation disabled")); } } // namespace diff --git a/tensorflow/compiler/jit/device_context_test.cc b/tensorflow/compiler/jit/device_context_test.cc index 7bad65bcf5a..2328ec42d97 100644 --- a/tensorflow/compiler/jit/device_context_test.cc +++ b/tensorflow/compiler/jit/device_context_test.cc @@ -28,14 +28,21 @@ namespace tensorflow { namespace { static bool Initialized = [] { + auto& rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api; + rollout_config.enabled_for_xla_launch_ = true; + rollout_config.enabled_for_compile_on_demand_ = true; + tensorflow::GetXlaDeviceFlags()->tf_xla_enable_xla_devices = true; - tensorflow::GetXlaOpsCommonFlags()->tf_xla_use_device_api = true; return true; }(); class DeviceContextTest : public ::testing::Test { public: void SetDevice(const string& device_type) { + auto& rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api; + rollout_config.AllowForDeviceInXlaLaunch(DeviceType(device_type)); + rollout_config.AllowForDeviceInXlaCompileOnDemand(DeviceType(device_type)); + auto device_factory = DeviceFactory::GetFactory(device_type); SessionOptions options; std::vector> devices; diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index dbd202bda97..6e8fceaf47d 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -213,7 +213,8 @@ void AllocateAndParseFlags() { ops_flags = new XlaOpsCommonFlags; ops_flags->tf_xla_always_defer_compilation = false; ops_flags->tf_xla_async_compilation = false; - ops_flags->tf_xla_use_device_api = false; + ops_flags->tf_xla_use_device_api.enabled_for_xla_launch_ = false; + ops_flags->tf_xla_use_device_api.enabled_for_compile_on_demand_ = false; // The `enable_mlir_bridge` flag allows the user to explicitly request that // their program is (or isn't) compiled using the MLIR-based TF-to-XLA bridge. @@ -267,9 +268,15 @@ void AllocateAndParseFlags() { "When lazy compilation is enabled, asynchronous compilation starts " "the cluster compilation in the background, and the fallback path " "is executed until the compilation has finished."), - Flag("tf_xla_use_device_api", &ops_flags->tf_xla_use_device_api, - "If true, uses the Device API (PjRt) for single device compilation." - " Defaults to false."), + Flag("tf_xla_use_device_api_for_xla_launch", + &ops_flags->tf_xla_use_device_api.enabled_for_xla_launch_, + "If true, uses Device API (PjRt) for single device compilation and " + "execution of functions marked for JIT compilation i.e. " + "jit_compile=True. 
Defaults to false."), + Flag("tf_xla_use_device_api_for_compile_on_demand", + &ops_flags->tf_xla_use_device_api.enabled_for_compile_on_demand_, + "If true, uses Device API (PjRt) for compiling and executing ops " + "one by one in 'on-demand' mode. Defaults to false."), Flag("tf_mlir_enable_mlir_bridge", &enable_mlir_bridge, "Enables experimental MLIR-Based TensorFlow Compiler Bridge.", diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index 650a53293fa..9f151b89eb7 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -20,7 +20,9 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_set.h" #include "absl/types/optional.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/util/command_line_flags.h" @@ -123,9 +125,55 @@ struct XlaOpsCommonFlags { // If true, _XlaCompile compiles the cluster asynchronously with respect to // the main execution. The fallback path is taken while compilation happens. bool tf_xla_async_compilation; - // If true, uses Device API (PjRt) for single device compilation. Defaults to - // false. - bool tf_xla_use_device_api; + + class PjRtForSingleDeviceCompilationRollout { + public: + // Allow using Device API (PjRt) for `device_type` in the XlaLaunch op. + // Please note that `enabled_for_xla_launch_` needs to be true in addition + // to the `device_type` being allowed in order to use the Device API for + // single device compilation and execution in the XlaLaunch op. + void AllowForDeviceInXlaLaunch(const DeviceType& device_type) { + xla_launch_allowed_devices_.insert(device_type.type_string()); + } + + bool IsEnabledInXlaLaunchForDevice(const DeviceType& device_type) const { + return enabled_for_xla_launch_ && + xla_launch_allowed_devices_.contains(device_type.type_string()); + } + + // Allow using Device API (PjRt) for `device_type` in the XlaCompileOnDemand + // op. Please note that `enabled_for_compile_on_demand_` needs to be true in + // addition to the `device_type` being allowed in order to use the Device + // API for single device compilation and execution in the XlaCompileOnDemand + // op. + void AllowForDeviceInXlaCompileOnDemand(const DeviceType& device_type) { + xla_compile_on_demand_allowed_devices_.insert(device_type.type_string()); + } + + bool IsEnabledInXlaCompileOnDemandForDevice( + const DeviceType& device_type) const { + return enabled_for_compile_on_demand_ && + xla_compile_on_demand_allowed_devices_.contains( + device_type.type_string()); + } + + // If true, uses Device API (PjRt) for single device compilation and + // execution of functions marked for JIT compilation i.e. jit_compile=True. + // Defaults to false. + bool enabled_for_xla_launch_; + + // If true, uses Device API (PjRt) for compiling and executing ops one by + // one in "on-demand" mode. Defaults to false. + bool enabled_for_compile_on_demand_; + + private: + // Devices for which using Device API (PjRt) is allowed in the XlaLaunch op. + // This can only be modified programmatically. + absl::flat_hash_set xla_launch_allowed_devices_; + // Devices for which using Device API (PjRt) is allowed in the + // XlaCompileOnDemand op. This can only be modified programmatically. + absl::flat_hash_set xla_compile_on_demand_allowed_devices_; + } tf_xla_use_device_api; }; // Flags for the build_xla_ops pass. 
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index 1f7436cdc95..8046207ed54 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -57,6 +57,7 @@ cc_library( "//tensorflow/compiler/jit:tf_graph_to_hlo_compiler", "//tensorflow/compiler/jit:tf_to_hlo_compiler", "//tensorflow/compiler/jit:xla_compile_util", + "//tensorflow/compiler/xla/pjrt:pjrt_client", "//tensorflow/core/platform:refcount", ], alwayslink = 1, diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index b2547aa7e09..913cca35be3 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/jit/kernels/xla_ops.h" +#include #include #include #include @@ -34,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/jit/xla_compile_util.h" #include "tensorflow/compiler/jit/xla_compiler_options_util.h" +#include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -41,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/executable_run_options.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/framework/allocator.h" @@ -51,6 +54,7 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -74,6 +78,8 @@ namespace tensorflow { namespace { using XlaDeviceCompiler = DeviceCompiler; +using PjRtDeviceCompiler = + DeviceCompiler; auto* xla_launch_counter = monitoring::Counter<1>::New( "/tensorflow/core/xla_launch_counter", @@ -233,21 +239,19 @@ GetXlaCompilerArgsAndSnapshotVariables( return result; } -} // namespace +XlaCompiler::CompileOptions GenerateCompileOptions( + bool has_ref_vars, bool may_alias_resource_update) { + XlaCompiler::CompileOptions compile_options; + compile_options.is_entry_computation = true; + // Optimization: where possible, have the computation return a naked array + // rather than a one-element tuple. 
+ compile_options.always_return_tuple = false; + compile_options.alias_resource_update = + !has_ref_vars && may_alias_resource_update; + return compile_options; +} -XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, - const std::vector& constants, - const std::vector& resources, - const NameAttrList& function, - bool has_ref_vars) - : AsyncOpKernel(ctx), - constants_(constants), - resources_(resources), - function_(function), - platform_info_(XlaPlatformInfoFromDevice(ctx->device())), - has_ref_vars_(has_ref_vars) {} - -static Status CompileToLocalExecutable( +Status CompileToLocalExecutable( OpKernelContext* ctx, const NameAttrList& function, bool has_ref_vars, const XlaPlatformInfo& platform_info, const std::vector& args, @@ -288,19 +292,78 @@ static Status CompileToLocalExecutable( *xla_device_compiler, *ctx->function_library(), ctx->device(), GetStream(ctx), platform_info, has_ref_vars); - XlaCompiler::CompileOptions compile_options; - compile_options.is_entry_computation = true; - // Optimization: where possible, have the computation return a naked array - // rather than a one-element tuple. - compile_options.always_return_tuple = false; - compile_options.alias_resource_update = - !has_ref_vars && may_alias_resource_update; + XlaCompiler::CompileOptions compile_options = + GenerateCompileOptions(has_ref_vars, may_alias_resource_update); return xla_device_compiler->CompileIfNeeded( options, function, args, compile_options, compile_mode, profiler, compilation_result, executable); } +Status CompileToPjRtLoadedExecutable( + const OpKernelContext& ctx, const XlaPlatformInfo& platform_info, + const NameAttrList& function, + const std::vector& args, + DeviceCompileMode compile_mode, bool has_ref_vars, + bool may_alias_resource_update, + const XlaCompiler::CompilationResult** compilation_result, + xla::PjRtClient** client, xla::PjRtLoadedExecutable** executable) { + // We store information about the JIT-compiled XLA computation + // in the ResourceMgr. + ResourceMgr* rm = ctx.resource_manager(); + if (!rm) { + return errors::Internal("No resource manager."); + } + + PjRtDeviceCompiler* pjrt_device_compiler; + TF_RETURN_IF_ERROR(rm->LookupOrCreate( + rm->default_container(), "pjrt_device_compiler", &pjrt_device_compiler, + [&](PjRtDeviceCompiler** pjrt_device_compiler) { + return BuildPjRtDeviceCompiler(platform_info, ctx.function_library(), + pjrt_device_compiler); + })); + DeviceCompilationProfiler* profiler; + TF_RETURN_IF_ERROR(rm->LookupOrCreate( + rm->default_container(), "pjrt_device_compilation_profiler", &profiler, + [](DeviceCompilationProfiler** profiler) { + *profiler = new DeviceCompilationProfiler(); + return OkStatus(); + })); + // Hold the reference to the PJRT device compiler and profiler during + // evaluation. (We could probably free them sooner because the ResourceMgr + // will retain references, but this is more obviously correct.) 
+ core::ScopedUnref pjrt_device_compiler_ref(pjrt_device_compiler); + core::ScopedUnref profiler_ref(profiler); + + *client = pjrt_device_compiler->client(); + + XlaCompiler::Options options = GenerateCompilerOptionsForPjRt( + *ctx.function_library(), ctx.device(), platform_info); + + XlaCompiler::CompileOptions compile_options = + GenerateCompileOptions(has_ref_vars, may_alias_resource_update); + + return pjrt_device_compiler->CompileIfNeeded( + options, function, args, compile_options, compile_mode, profiler, + compilation_result, executable); +} + +Status GetUpdatedVariables( + const OpKernelContext* ctx, absl::Span inputs, + absl::Span variable_indices, + const XlaCompiler::CompilationResult& compilation_result, + std::vector* variable_infos) { + std::set variables_updated; + for (const auto& resource_update : compilation_result.resource_updates) { + if (resource_update.modified) { + variables_updated.insert(resource_update.input_index); + } + } + return GetVariableInfosFromInputs(ctx->resource_manager(), ctx->device(), + inputs, variable_indices, + &variables_updated, variable_infos); +} + // Get-or-create thread pool for a given collective. static thread::ThreadPool* GetOrCreateThreadPoolForCollective( const XlaCompilationResult::CollectiveInfo& collective_info) { @@ -321,6 +384,33 @@ static thread::ThreadPool* GetOrCreateThreadPoolForCollective( return &it->second; } +void RunInThreadPoolIfCollectivesPresent( + const XlaCompiler::CompilationResult& compilation_result, + std::function execution_fn) { + // If we are using collectives, we need to run in a separate threadpool. + if (compilation_result.collective_info.has_value()) { + GetOrCreateThreadPoolForCollective(*compilation_result.collective_info) + ->Schedule(execution_fn); + } else { + // Otherwise, just run normally: we merely "pretend" to be asynchronous. + execution_fn(); + } +} + +} // namespace + +XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, + const std::vector& constants, + const std::vector& resources, + const NameAttrList& function, + bool has_ref_vars) + : AsyncOpKernel(ctx), + constants_(constants), + resources_(resources), + function_(function), + platform_info_(XlaPlatformInfoFromDevice(ctx->device())), + has_ref_vars_(has_ref_vars) {} + void XlaLocalLaunchBase::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { VLOG(1) << "XlaLocalLaunchOpBase::Compute " << Canonicalize(function_.name(), AttrSlice(&function_.attr())); @@ -328,10 +418,14 @@ void XlaLocalLaunchBase::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { ->IncrementBy(1); std::vector inputs = InputsFromContext(ctx); - xla::LocalClient* client; - const XlaCompiler::CompilationResult* compilation_result; - xla::LocalExecutable* executable; std::vector xla_compiler_args; + const XlaCompiler::CompilationResult* compilation_result; + + xla::LocalClient* client; // Not owned. + xla::LocalExecutable* executable; // Not owned. + + xla::PjRtClient* pjrt_client; // Not owned. + xla::PjRtLoadedExecutable* pjrt_executable; // Not owned. // Note that here we assume the shape of the variables don't change between // compilation and execution. 
The locks on the variables are released before @@ -357,6 +451,50 @@ void XlaLocalLaunchBase::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { OP_REQUIRES_OK_ASYNC(ctx, status_or_xla_compiler_args.status(), done); xla_compiler_args = std::move(status_or_xla_compiler_args.value()); } + + bool use_pjrt = GetXlaOpsCommonFlags() + ->tf_xla_use_device_api.IsEnabledInXlaLaunchForDevice( + platform_info_.device_type()); + if (use_pjrt) { + VLOG(2) << "Compiling using PJRT"; + Status status = CompileToPjRtLoadedExecutable( + *ctx, platform_info_, function_, xla_compiler_args, + DeviceCompileMode::kStrict, has_ref_vars_, + /*may_alias_resource_update=*/true, &compilation_result, &pjrt_client, + &pjrt_executable); + OP_REQUIRES_OK_ASYNC(ctx, status, done); + + VLOG(2) << "Compiled using PJRT: " << status; + VLOG(2) << "pjrt_executable != nullptr: " << (pjrt_executable != nullptr); + VLOG(2) << "compilation_result != nullptr: " + << (compilation_result != nullptr); + VLOG(2) << "Executing using PJRT."; + + auto run_pjrt_cluster = [ctx, pjrt_client, pjrt_executable, + compilation_result, done, inputs, + resources = resources_]() { + auto platform_info = XlaPlatformInfoFromDevice(ctx->device()); + std::vector variable_infos; + OP_REQUIRES_OK_ASYNC( + ctx, + GetUpdatedVariables(ctx, inputs, resources, *compilation_result, + &variable_infos), + done); + OP_REQUIRES_OK_ASYNC(ctx, LockVariables(absl::MakeSpan(variable_infos)), + done); + OP_REQUIRES_OK_ASYNC( + ctx, + RunPjRtExecutable(*pjrt_client, inputs, variable_infos, + *compilation_result, pjrt_executable, ctx), + done); + VLOG(2) << "Done executing with PJRT."; + done(); + }; + + RunInThreadPoolIfCollectivesPresent(*compilation_result, run_pjrt_cluster); + return; + } + Status status = CompileToLocalExecutable( ctx, function_, /*has_ref_vars=*/has_ref_vars_, platform_info_, xla_compiler_args, DeviceCompileMode::kStrict, @@ -369,17 +507,11 @@ void XlaLocalLaunchBase::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { inputs, resources = resources_]() { auto platform_info = XlaPlatformInfoFromDevice(ctx->device()); std::vector variable_infos; - std::set variables_updated; - for (const auto& resource_update : compilation_result->resource_updates) { - if (resource_update.modified) { - variables_updated.insert(resource_update.input_index); - } - } - OP_REQUIRES_OK_ASYNC(ctx, - GetVariableInfosFromInputs( - ctx->resource_manager(), ctx->device(), inputs, - resources, &variables_updated, &variable_infos), - done); + OP_REQUIRES_OK_ASYNC( + ctx, + GetUpdatedVariables(ctx, inputs, resources, *compilation_result, + &variable_infos), + done); OP_REQUIRES_OK_ASYNC(ctx, LockVariables(absl::MakeSpan(variable_infos)), done); std::map resource_var_ptrs; @@ -435,14 +567,7 @@ void XlaLocalLaunchBase::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { done(); }; - // If we are using collectives, we need to run in a separate threadpool. - if (compilation_result->collective_info.has_value()) { - GetOrCreateThreadPoolForCollective(*compilation_result->collective_info) - ->Schedule(run_xla_cluster); - } else { - // Otherwise, just run normally: we merely "pretend" to be asynchronous. 
- run_xla_cluster(); - } + RunInThreadPoolIfCollectivesPresent(*compilation_result, run_xla_cluster); } namespace { diff --git a/tensorflow/compiler/jit/pjrt_base_device.cc b/tensorflow/compiler/jit/pjrt_base_device.cc new file mode 100644 index 00000000000..d7c12921c71 --- /dev/null +++ b/tensorflow/compiler/jit/pjrt_base_device.cc @@ -0,0 +1,60 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/jit/pjrt_base_device.h" + +namespace tensorflow { +namespace { + +DeviceAttributes BuildPjRtBaseDeviceAttributes(const string& name_prefix, + const string& device_name, + int device_ordinal) { + return Device::BuildDeviceAttributes( + absl::StrCat(name_prefix, "/device:", device_name, ":", device_ordinal), + DeviceType(device_name), Bytes(16ULL << 30), DeviceLocality(), + absl::StrCat("device: ", device_name, " device")); +} + +} // namespace + +PjRtBaseDevice::PjRtBaseDevice(const SessionOptions& session_options, + const Options& options) + : LocalDevice(session_options, + BuildPjRtBaseDeviceAttributes(options.device_name_prefix, + options.device_name, + options.device_ordinal)), + metadata_(DeviceType(options.compilation_device_name), + options.shape_determination_fns) { + if (options.shape_determination_fns.empty()) { + LOG(ERROR) << "shape_representation_fns must be non-empty."; + } + VLOG(1) << "Created PJRT base device " << options.compilation_device_name + << " device_name: " << name(); +} + +/*static*/ StatusOr +PjRtBaseDevice::GetMetadataFromDevice(DeviceBase* device) { + PjRtBaseDevice* pjrt_device = + dynamic_cast(device->UnderlyingDevice()); + if (pjrt_device == nullptr) { + return errors::Internal( + "Cannot get device metadata from non-PJRT device \"", device->name(), + "\". GetMetadata must only be called on a device derived from " + "PjRtBaseDevice. Either an internal bug has been triggered, or an " + "XLA-specific op has been placed on the wrong device."); + } + return &pjrt_device->metadata_; +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/pjrt_base_device.h b/tensorflow/compiler/jit/pjrt_base_device.h new file mode 100644 index 00000000000..26c8f88efab --- /dev/null +++ b/tensorflow/compiler/jit/pjrt_base_device.h @@ -0,0 +1,111 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_PJRT_BASE_DEVICE_H_ +#define TENSORFLOW_COMPILER_JIT_PJRT_BASE_DEVICE_H_ + +#include +#include +#include + +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/device_base.h" + +namespace tensorflow { + +// tensorflow::PjRtBaseDevice replaces the deprecated tensorflow::XlaDevice. +// This accelerator agnostic device is mainly used to store metadata. +class PjRtBaseDevice : public LocalDevice { + public: + // Stores metadata about the PjRtBaseDevice. + class Metadata { + public: + Metadata(const DeviceType& jit_device_type, + std::vector + shape_determination_fns) + : jit_device_type_(jit_device_type), + shape_determination_fns_(std::move(shape_determination_fns)) {} + + // The index of the device on this host. + int device_ordinal() const; + + const DeviceType& jit_device_type() const { return jit_device_type_; } + const XlaShapeLayoutHelpers::ShapeDeterminationFns& + default_shape_determination_fns() const { + return shape_determination_fns_.at(0); + } + + const XlaShapeLayoutHelpers::ShapeDeterminationFns& + shape_determination_fns_at(int i) const { + return shape_determination_fns_[i]; + } + + private: + const DeviceType jit_device_type_; + std::vector + shape_determination_fns_; + + TF_DISALLOW_COPY_AND_ASSIGN(Metadata); + }; + + struct Options { + // The device name's prefix (e.g., "/task:7") + std::string device_name_prefix; + + // The name of the device (e.g., "TPU") + std::string device_name; + + // The index of the device. + int device_ordinal = -1; + + // The name of the compilation device, also referred to as jit_device_type. + // (e.g., "XLA_CPU_JIT"); + std::string compilation_device_name; + + // A vector of ShapeDeterminationFn (i.e., a bundle of LayoutSelectionFn, + // ShapeRepresentationFn). Each bundle describes how the on-host shapes of + // a) argument and return value, for entry computations b) variables, for + // all computations, should be represented in XLA. Parameters/return values + // will be shaped according to the function pair, and reshaped back to/from + // their declared shapes for computations. Must be non-empty. + std::vector + shape_determination_fns; + + Options(std::string device_name_prefix, std::string device_name, + int device_ordinal, std::string compilation_device_name, + std::vector + shape_determination_fns) + : device_name_prefix(device_name_prefix), + device_name(device_name), + device_ordinal(device_ordinal), + compilation_device_name(compilation_device_name), + shape_determination_fns(shape_determination_fns) {} + }; + + // Creates a new PJRT base device. + PjRtBaseDevice(const SessionOptions& session_options, const Options& options); + + static StatusOr GetMetadataFromDevice( + DeviceBase* device); + + private: + // The metadata of this PjRtBaseDevice. + const Metadata metadata_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_PJRT_BASE_DEVICE_H_ diff --git a/tensorflow/compiler/jit/pjrt_device_context.cc b/tensorflow/compiler/jit/pjrt_device_context.cc index 6c0cd4c50cb..90e12d218d7 100644 --- a/tensorflow/compiler/jit/pjrt_device_context.cc +++ b/tensorflow/compiler/jit/pjrt_device_context.cc @@ -16,41 +16,68 @@ limitations under the License. 
#include "tensorflow/compiler/jit/pjrt_device_context.h" #include +#include #include #include "tensorflow/compiler/tf2xla/literal_util.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/core/framework/device.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/tfrt/common/async_value_tensor.h" #include "tensorflow/core/tfrt/common/create_pjrt_client_util.h" +#include "tensorflow/tsl/framework/device_id_utils.h" namespace tensorflow { namespace { StatusOr> HostTensorToPjRtBuffer( const tensorflow::Tensor* cpu_tensor, tensorflow::Device* device, - xla::PjRtClient* pjrt_client) { - // TODO(b/262472386): Consider layout_preference_fn and - // shape_representation_fn. - xla::Shape shape; - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(cpu_tensor->dtype(), cpu_tensor->shape(), &shape)); + xla::PjRtClient* pjrt_client, + const XlaShapeLayoutHelpers::ShapeDeterminationFns + shape_determination_fns) { + XlaLayoutPreference layout_preference = + shape_determination_fns.layout_preference_fn( + cpu_tensor->shape(), cpu_tensor->dtype(), std::nullopt); + TF_ASSIGN_OR_RETURN(xla::Shape shape, + shape_determination_fns.shape_representation_fn( + cpu_tensor->shape(), cpu_tensor->dtype(), + /*fast_mem=*/false, layout_preference)); + const xla::Layout* device_layout = &(shape.layout()); + // The device id should matche the local_hardware_id in + // tensorflow/compiler/xla/pjrt/pjrt_client.h. TF_ASSIGN_OR_RETURN( - xla::PjRtDevice * pjrt_device, - pjrt_client->LookupAddressableDevice(device->parsed_name().id)); - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer, - pjrt_client->BufferFromHostBuffer( - cpu_tensor->data(), shape.element_type(), shape.dimensions(), - /*byte_strides=*/std::nullopt, - xla::PjRtClient::HostBufferSemantics::kZeroCopy, - /*on_done_with_host_buffer=*/ - [cpu_tensor = *cpu_tensor]() { /* frees tensor */ }, pjrt_device)); - return buffer; + const int pjrt_device_id, + tsl::GetDeviceIdFromDeviceParsedName(device->parsed_name(), + DeviceType(device->device_type()))); + TF_ASSIGN_OR_RETURN(xla::PjRtDevice * pjrt_device, + pjrt_client->LookupAddressableDevice(pjrt_device_id)); + auto first_try_buffer = pjrt_client->BufferFromHostBuffer( + cpu_tensor->data(), shape.element_type(), shape.dimensions(), + /*byte_strides=*/std::nullopt, + xla::PjRtClient::HostBufferSemantics::kZeroCopy, + /*on_done_with_host_buffer=*/ + [cpu_tensor = *cpu_tensor]() { /* frees tensor */ }, pjrt_device, + device_layout); + if (first_try_buffer.ok()) { + return std::move(*first_try_buffer); + } + if (first_try_buffer.status().code() == absl::StatusCode::kUnimplemented) { + LOG_FIRST_N(WARNING, 1) + << first_try_buffer.status() + << "; fallback to BufferFromHostBuffer without device layout."; + TF_ASSIGN_OR_RETURN( + std::unique_ptr second_try_buffer, + pjrt_client->BufferFromHostBuffer( + cpu_tensor->data(), shape.element_type(), shape.dimensions(), + /*byte_strides=*/std::nullopt, + xla::PjRtClient::HostBufferSemantics::kZeroCopy, + /*on_done_with_host_buffer=*/ + [cpu_tensor = *cpu_tensor]() { /* frees tensor */ }, pjrt_device)); + return second_try_buffer; + } else { + return first_try_buffer.status(); + } } - } // namespace void PjRtDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, @@ -101,18 +128,16 @@ void PjRtDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, done(pjrt_client.status()); return; } - StatusOr> buffer_or = - HostTensorToPjRtBuffer(cpu_tensor, device, *pjrt_client); + 
StatusOr> buffer_or = HostTensorToPjRtBuffer( + cpu_tensor, device, *pjrt_client, shape_determination_fns_); if (!buffer_or.ok()) { done(buffer_or.status()); return; } - std::unique_ptr device_buffer = std::move(buffer_or.value()); + result_tensor->SetBuffer(std::move(*buffer_or)); // TODO(b/244666476): evaluate the performance impact of marking ready when - // the data in device buffer is computed. In `tpu_device_context`, it is - // marked done when the allocation finished. - device_buffer->GetReadyFuture().OnReady(std::move(done)); - result_tensor->SetBuffer(std::move(device_buffer)); + // the data in device buffer is computed. + result_tensor->GetBuffer()->GetReadyFuture().OnReady(std::move(done)); } void PjRtDeviceContext::CopyTensorInSameDevice(const Tensor* input_tensor, diff --git a/tensorflow/compiler/jit/pjrt_device_context.h b/tensorflow/compiler/jit/pjrt_device_context.h index 42e72dbd9d7..519598d3fe8 100644 --- a/tensorflow/compiler/jit/pjrt_device_context.h +++ b/tensorflow/compiler/jit/pjrt_device_context.h @@ -16,9 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_PJRT_DEVICE_CONTEXT_H_ #define TENSORFLOW_COMPILER_JIT_PJRT_DEVICE_CONTEXT_H_ -#include +#include -#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/platform/status.h" @@ -28,6 +28,10 @@ namespace tensorflow { // devices using PjRt. class PjRtDeviceContext : public DeviceContext { public: + explicit PjRtDeviceContext( + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns) + : shape_determination_fns_(std::move(shape_determination_fns)) {} + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, StatusCallback done, bool sync_dst_compute) const override; @@ -37,6 +41,9 @@ class PjRtDeviceContext : public DeviceContext { void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, Tensor* output_tensor, StatusCallback done) const override; + + private: + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/jit/test_util.cc b/tensorflow/compiler/jit/test_util.cc index 6add3dae494..8c1268fc09b 100644 --- a/tensorflow/compiler/jit/test_util.cc +++ b/tensorflow/compiler/jit/test_util.cc @@ -15,8 +15,14 @@ limitations under the License. 
#include "tensorflow/compiler/jit/test_util.h" +#include +#include +#include + #include "tensorflow/compiler/jit/shape_inference.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/framework/device_factory.h" +#include "tensorflow/core/public/version.h" namespace tensorflow { @@ -54,4 +60,39 @@ Status ShapeAnnotationsMatch( return OkStatus(); } +void DeviceSetup::AddDevicesAndSetUp( + const std::vector& device_names) { + SessionOptions options; + auto* device_count = options.config.mutable_device_count(); + for (const auto& device_name : device_names) { + device_count->insert({device_name, 1}); + } + + std::vector> devices; + TF_CHECK_OK(DeviceFactory::AddDevices( + options, "/job:localhost/replica:0/task:0", &devices)); + device_mgr_ = std::make_unique(std::move(devices)); + + OptimizerOptions opts; + lib_def_ = std::make_unique(OpRegistry::Global(), + FunctionDefLibrary()); + pflr_ = std::make_unique( + device_mgr_.get(), Env::Default(), /*config=*/nullptr, + TF_GRAPH_DEF_VERSION, lib_def_.get(), opts, + /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr); + flr_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0"); +} + +Device* DeviceSetup::GetDevice(const string& device_name) { + if (device_mgr_ == nullptr) { + return nullptr; + } + + string full_device_name = absl::StrCat( + "/job:localhost/replica:0/task:0/device:", device_name, ":0"); + Device* device; + TF_CHECK_OK(device_mgr_->LookupDevice(full_device_name, &device)); + return device; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/test_util.h b/tensorflow/compiler/jit/test_util.h index b5982c490df..aad58daab2a 100644 --- a/tensorflow/compiler/jit/test_util.h +++ b/tensorflow/compiler/jit/test_util.h @@ -19,11 +19,15 @@ limitations under the License. #define TENSORFLOW_COMPILER_JIT_TEST_UTIL_H_ #include -#include +#include +#include +#include #include #include "tensorflow/compiler/jit/shape_inference.h" +#include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/partial_tensor_shape.h" @@ -62,7 +66,20 @@ struct GraphOptimizationPassWrapper { SessionOptions session_options; }; +// Helps set up devices for unit tests. 
+class DeviceSetup { + public: + void AddDevicesAndSetUp(const std::vector& device_names); + Device* GetDevice(const string& device_name); + FunctionLibraryRuntime* flr() { return flr_; } + + private: + FunctionLibraryRuntime* flr_; + std::unique_ptr device_mgr_; + std::unique_ptr lib_def_; + std::unique_ptr pflr_; +}; + } // namespace tensorflow - #endif // TENSORFLOW_COMPILER_JIT_TEST_UTIL_H_ diff --git a/tensorflow/compiler/jit/tests/device_compiler_serialize_options_test.cc b/tensorflow/compiler/jit/tests/device_compiler_serialize_options_test.cc index 3eaa8202261..3da7ac13eae 100644 --- a/tensorflow/compiler/jit/tests/device_compiler_serialize_options_test.cc +++ b/tensorflow/compiler/jit/tests/device_compiler_serialize_options_test.cc @@ -41,7 +41,7 @@ TEST_F(DeviceCompilerSerializeTest, PersistentCacheOptionsTest) { AlterPersistentCacheEntryHloModuleNames(tensorflow::testing::TmpDir()); EXPECT_FALSE(status.ok()); EXPECT_TRUE(absl::StrContains( - status.error_message(), + status.message(), "Did not find any persistent XLA compilation cache entries to alter.")); TF_ASSERT_OK(AlterPersistentCacheEntryHloModuleNames( diff --git a/tensorflow/compiler/jit/tests/device_compiler_serialize_test.cc b/tensorflow/compiler/jit/tests/device_compiler_serialize_test.cc index 984b9852535..9233d8e43e5 100644 --- a/tensorflow/compiler/jit/tests/device_compiler_serialize_test.cc +++ b/tensorflow/compiler/jit/tests/device_compiler_serialize_test.cc @@ -57,8 +57,8 @@ TEST_F(DeviceCompilerSerializeTest, PersistentCacheTest) { for (int b = 1; b < 4; ++b) { auto status = ExecuteWithBatch(graph, b); EXPECT_FALSE(status.ok()); - EXPECT_TRUE(absl::StrContains(status.error_message(), - "Serialized HLO does not match.")); + EXPECT_TRUE( + absl::StrContains(status.message(), "Serialized HLO does not match.")); } } diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index ae74dccec69..f6bdaf4e0bc 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -17,33 +17,93 @@ limitations under the License. 
#include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" +#include #include #include +#include +#include "absl/log/check.h" +#include "absl/log/log.h" #include "absl/memory/memory.h" +#include "absl/types/span.h" #include "tensorflow/compiler/jit/device_compilation_profiler.h" +#include "tensorflow/compiler/jit/device_compiler.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/variable_info.h" +#include "tensorflow/compiler/jit/variable_info_util.h" +#include "tensorflow/compiler/jit/xla_compile_util.h" #include "tensorflow/compiler/jit/xla_compiler_options_util.h" -#include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/executable_run_options.h" +#include "tensorflow/compiler/xla/hlo/ir/hlo_input_output_alias_config.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/tsl/platform/errors.h" namespace tensorflow { namespace { using XlaDeviceCompiler = DeviceCompiler; +using PjRtDeviceCompiler = + DeviceCompiler; + +XlaCompiler::CompileOptions GetCompileOptions(bool for_pjrt = false) { + XlaCompiler::CompileOptions compile_options; + compile_options.is_entry_computation = true; + // Optimization: where possible, have the computation return a naked array + // rather than a one-element tuple. + compile_options.always_return_tuple = false; + if (for_pjrt) { + compile_options.use_tuple_arg = false; + compile_options.always_return_tuple = true; + } + + return compile_options; +} + +// Gets `variables` from `ctx`, locks them and builds XlaCompiler::Arguments +// using them. Stores the arguments in `args`. `variables` and `args` passed in +// will be cleared before populating them. 
+Status GetAndLockVariablesAndBuildXlaCompilerArguments( + const OpKernelContext& ctx, const std::vector& inputs, + const std::vector& constant_indices, + const std::vector& variable_indices, + std::vector* variables, + std::vector* args) { + variables->clear(); + args->clear(); + TF_RETURN_IF_ERROR(GetVariableInfosFromInputs(ctx.resource_manager(), + ctx.device(), inputs, + variable_indices, variables)); + TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(*variables))); + TF_ASSIGN_OR_RETURN(*args, + XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_indices, inputs, *variables, + static_cast(ctx.device()))); + return OkStatus(); +} } // namespace -Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, - XlaDeviceCompiler* xla_device_compiler, +Status XlaCompileOnDemandOp::Run(const ResourceVarsSnapshot& variable_args, const XlaCompiler::CompilationResult* result, + const XlaDeviceCompiler* xla_device_compiler, xla::LocalExecutable* executable, - const ResourceVarsSnapshot& variable_args) { + OpKernelContext* ctx) { xla::LocalClient* client = static_cast(xla_device_compiler->client()); @@ -104,14 +164,48 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, } Status XlaCompileOnDemandOp::Compile( - OpKernelContext* ctx, const XlaCompiler::CompilationResult** result, - XlaDeviceCompiler** xla_device_compiler, - DeviceCompilationProfiler** profiler, ResourceVarsSnapshot* variable_args, - xla::LocalExecutable** executable) { - TF_ASSIGN_OR_RETURN(std::vector constant_input_indices, - GetConstantInputIndicesFromContext(ctx)); - std::vector inputs = InputsFromContext(ctx); + const std::vector& args, OpKernelContext* ctx, + PjRtDeviceCompiler** pjrt_device_compiler, + DeviceCompilationProfiler** profiler, + const XlaCompiler::CompilationResult** result, + xla::PjRtLoadedExecutable** executable) { + // We store information about the JIT-compiled XLA computation + // in the ResourceMgr. + ResourceMgr* rm = ctx->resource_manager(); + if (!rm) { + return errors::Internal("No resource manager."); + } + TF_RETURN_IF_ERROR(rm->LookupOrCreate( + rm->default_container(), "pjrt_device_compiler", pjrt_device_compiler, + [&](PjRtDeviceCompiler** pjrt_device_compiler) { + return BuildPjRtDeviceCompiler(platform_info_, ctx->function_library(), + pjrt_device_compiler); + })); + TF_RETURN_IF_ERROR(rm->LookupOrCreate( + rm->default_container(), "pjrt_device_compilation_profiler", profiler, + [](DeviceCompilationProfiler** profiler) { + *profiler = new DeviceCompilationProfiler(); + return OkStatus(); + })); + + XlaCompiler::Options options = GenerateCompilerOptionsForPjRt( + *(ctx->function_library()), ctx->device(), platform_info_); + // No detailed logging for on demand op. + options.detailed_logging = false; + XlaCompiler::CompileOptions compile_options = GetCompileOptions(true); + + return (*pjrt_device_compiler) + ->CompileSingleOpIfNeeded(options, args, compile_options, ctx, *profiler, + result, executable); +} + +Status XlaCompileOnDemandOp::Compile( + const std::vector& args, OpKernelContext* ctx, + XlaDeviceCompiler** xla_device_compiler, + DeviceCompilationProfiler** profiler, + const XlaCompiler::CompilationResult** result, + xla::LocalExecutable** executable) { // We store information about the JIT-compiled XLA computation // in the ResourceMgr. ResourceMgr* rm = ctx->resource_manager(); @@ -137,54 +231,87 @@ Status XlaCompileOnDemandOp::Compile( platform_info_, /*has_ref_vars=*/true); // No detailed logging from on demand op. 
options.detailed_logging = false; - XlaCompiler::CompileOptions compile_options; - compile_options.is_entry_computation = true; - // Optimization: where possible, have the computation return a naked array - // rather than a one-element tuple. - compile_options.always_return_tuple = false; - - std::vector variables_indices = - GetResourceVariableIndicesFromContext(ctx); - StatusOr> args; - { - std::vector variable_infos; - TF_RETURN_IF_ERROR( - GetVariableInfosFromInputs(ctx->resource_manager(), ctx->device(), - inputs, variables_indices, &variable_infos)); - - TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos))); - TF_RETURN_IF_ERROR(SnapshotResourceVariables( - ctx, variables_indices, variable_infos, variable_args)); - - args = XlaComputationLaunchContext::BuildXlaCompilerArguments( - constant_input_indices, inputs, variable_infos, - static_cast(ctx->device())); - TF_RETURN_IF_ERROR(args.status()); - } + XlaCompiler::CompileOptions compile_options = GetCompileOptions(); return (*xla_device_compiler) - ->CompileSingleOpIfNeeded(options, *args, compile_options, ctx, *profiler, + ->CompileSingleOpIfNeeded(options, args, compile_options, ctx, *profiler, result, executable); } void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) { const XlaCompiler::CompilationResult* result; - xla::LocalExecutable* executable; - ResourceVarsSnapshot variable_args; - XlaDeviceCompiler* xla_device_compiler; DeviceCompilationProfiler* profiler; + OP_REQUIRES(ctx, ctx->function_library(), errors::Internal("Function library missing")); - OP_REQUIRES_OK(ctx, Compile(ctx, &result, &xla_device_compiler, &profiler, - &variable_args, &executable)); - // Hold the reference to the XLA device compiler and profiler during - // evaluation. (We could probably free them sooner because the ResourceMgr - // will retain references, but this is more obviously correct.) - core::ScopedUnref xla_device_compiler_ref(xla_device_compiler); - core::ScopedUnref profiler_ref(profiler); - OP_REQUIRES_OK( - ctx, Run(ctx, xla_device_compiler, result, executable, variable_args)); + // Get constants, inputs and variables from the OpKernelContext. + auto constant_indices_or = GetConstantInputIndicesFromContext(ctx); + OP_REQUIRES_OK(ctx, constant_indices_or.status()); + std::vector inputs = InputsFromContext(ctx); + std::vector variable_indices = + GetResourceVariableIndicesFromContext(ctx); + + bool use_pjrt = + GetXlaOpsCommonFlags() + ->tf_xla_use_device_api.IsEnabledInXlaCompileOnDemandForDevice( + platform_info_.device_type()); + if (use_pjrt) { + std::vector variables; + std::vector args; + // Lock variables for the whole duration of compile + execute. + OP_REQUIRES_OK(ctx, GetAndLockVariablesAndBuildXlaCompilerArguments( + *ctx, inputs, *constant_indices_or, + variable_indices, &variables, &args)); + + PjRtDeviceCompiler* pjrt_device_compiler; + xla::PjRtLoadedExecutable* pjrt_executable; + OP_REQUIRES_OK(ctx, Compile(args, ctx, &pjrt_device_compiler, &profiler, + &result, &pjrt_executable)); + // Hold the reference to the XLA device compiler and profiler during + // evaluation. (We could probably free them sooner because the ResourceMgr + // will retain references, but this is more obviously correct.) 
+ core::ScopedUnref pjrt_device_compiler_ref(pjrt_device_compiler); + core::ScopedUnref profiler_ref(profiler); + + VLOG(2) << "Compiled op with PJRT: " << ctx->status(); + VLOG(2) << "result != nullptr: " << (result != nullptr); + VLOG(2) << "pjrt_executable != nullptr: " << (pjrt_executable != nullptr); + VLOG(2) << "Executing with PJRT ..."; + + OP_REQUIRES_OK(ctx, + RunPjRtExecutable(*pjrt_device_compiler->client(), inputs, + variables, *result, pjrt_executable, ctx)); + + VLOG(2) << "Completed executing with PJRT!"; + } else { + ResourceVarsSnapshot variable_args; + std::vector args; + // Lock variables only for generating XlaCompiler::Arguments and then + // release them. + { + std::vector variables; + OP_REQUIRES_OK(ctx, GetAndLockVariablesAndBuildXlaCompilerArguments( + *ctx, inputs, *constant_indices_or, + variable_indices, &variables, &args)); + OP_REQUIRES_OK(ctx, SnapshotResourceVariables(ctx, variable_indices, + variables, &variable_args)); + } + + XlaDeviceCompiler* xla_device_compiler; + xla::LocalExecutable* executable; + OP_REQUIRES_OK(ctx, Compile(args, ctx, &xla_device_compiler, &profiler, + &result, &executable)); + // Hold the reference to the XLA device compiler and profiler during + // evaluation. (We could probably free them sooner because the ResourceMgr + // will retain references, but this is more obviously correct.) + core::ScopedUnref xla_device_compiler_ref(xla_device_compiler); + core::ScopedUnref profiler_ref(profiler); + + // Locks are acquired again when populating the `ctx` outputs. + OP_REQUIRES_OK( + ctx, Run(variable_args, result, xla_device_compiler, executable, ctx)); + } } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h index c5e6e8a8c72..ced95edc604 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.h +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h @@ -19,14 +19,16 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_ #define TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_ +#include + #include "tensorflow/compiler/jit/device_compilation_profiler.h" #include "tensorflow/compiler/jit/variable_info.h" #include "tensorflow/compiler/jit/variable_info_util.h" -#include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" @@ -45,21 +47,27 @@ class XlaCompileOnDemandOp : public OpKernel { void Compute(OpKernelContext* ctx) override; private: - XlaCompiler::Argument CreateCompilerArgument(OpKernelContext* ctx, int64_t i); - Status Compile(OpKernelContext* ctx, - const XlaCompiler::CompilationResult** result, + Status Compile(const std::vector& args, + OpKernelContext* ctx, DeviceCompiler** xla_device_compiler, DeviceCompilationProfiler** profiler, - ResourceVarsSnapshot* variable_args, + const XlaCompiler::CompilationResult** result, xla::LocalExecutable** executable); - Status Run(OpKernelContext* ctx, - DeviceCompiler* - xla_device_compiler, + Status Compile(const std::vector& args, + OpKernelContext* ctx, + DeviceCompiler** + pjrt_device_compiler, + DeviceCompilationProfiler** profiler, + const XlaCompiler::CompilationResult** result, + xla::PjRtLoadedExecutable** executable); + + Status Run(const ResourceVarsSnapshot& variable_args, const XlaCompiler::CompilationResult* result, - xla::LocalExecutable* executable, - const ResourceVarsSnapshot& variable_args); + const DeviceCompiler* + xla_device_compiler, + xla::LocalExecutable* executable, OpKernelContext* ctx); const XlaPlatformInfo platform_info_; }; diff --git a/tensorflow/compiler/jit/xla_compile_util.cc b/tensorflow/compiler/jit/xla_compile_util.cc index 8d72d20ba55..e5256a8b2c9 100644 --- a/tensorflow/compiler/jit/xla_compile_util.cc +++ b/tensorflow/compiler/jit/xla_compile_util.cc @@ -66,8 +66,10 @@ StatusOr> CreateSingleOpGraph( return graph; } -bool UsePjRtForSingleDeviceCompilation() { - return GetXlaOpsCommonFlags()->tf_xla_use_device_api; +bool UsePjRtForSingleDeviceCompilation(const DeviceType& device_type) { + const auto& rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api; + return rollout_config.IsEnabledInXlaLaunchForDevice(device_type) || + rollout_config.IsEnabledInXlaCompileOnDemandForDevice(device_type); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compile_util.h b/tensorflow/compiler/jit/xla_compile_util.h index bdc0ebafad5..345c55a86e5 100644 --- a/tensorflow/compiler/jit/xla_compile_util.h +++ b/tensorflow/compiler/jit/xla_compile_util.h @@ -44,7 +44,9 @@ StatusOr> CreateSingleOpGraph( const NodeDef& node_def, absl::Span args, absl::Span result_types); -bool UsePjRtForSingleDeviceCompilation(); +// Checks if single device compilation and execution with PJRT is enabled for +// `device_type` in either the XlaLaunch op or the XlaCompileOnDemand op. 
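+// Note: this returns false by default; both the corresponding rollout flag
+// and a per-device allowlist entry are required (see the examples in
+// xla_compile_util_test.cc).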
+bool UsePjRtForSingleDeviceCompilation(const DeviceType& device_type); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_XLA_COMPILE_UTIL_H_ diff --git a/tensorflow/compiler/jit/xla_compile_util_test.cc b/tensorflow/compiler/jit/xla_compile_util_test.cc index 0e971a6b4db..9fc706fb649 100644 --- a/tensorflow/compiler/jit/xla_compile_util_test.cc +++ b/tensorflow/compiler/jit/xla_compile_util_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/kernels/ops_testutil.h" @@ -73,5 +74,49 @@ TEST_F(OpsTestBase, CreateSingleOpGraph) { EXPECT_EQ(retval_input_node->name(), "identity_op"); } +TEST(XlaCompileUtilTest, PjRtXlaLaunchFlagTest) { + EXPECT_FALSE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); + + // Flag is turned on, but no device is allowlisted. + auto& rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api; + rollout_config.enabled_for_xla_launch_ = true; + + EXPECT_FALSE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); + + // Flag is turned on, some device is allowlisted, but the requested one isn't. + rollout_config.AllowForDeviceInXlaLaunch(DeviceType(DEVICE_GPU)); + EXPECT_FALSE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); + + // Flag is turned on and the requested device is allowlisted. + rollout_config.AllowForDeviceInXlaLaunch(DeviceType(DEVICE_CPU)); + EXPECT_TRUE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); + + // The requested device is allowlisted, but the flag is turned off. + rollout_config.enabled_for_xla_launch_ = false; + EXPECT_FALSE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); +} + +TEST(XlaCompileUtilTest, PjRtXlaCompileOnDemandFlagTest) { + EXPECT_FALSE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); + + // Flag is turned on, but no device is allowlisted. + auto& rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api; + rollout_config.enabled_for_compile_on_demand_ = true; + + EXPECT_FALSE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); + + // Flag is turned on, some device is allowlisted, but the requested one isn't. + rollout_config.AllowForDeviceInXlaCompileOnDemand(DeviceType(DEVICE_GPU)); + EXPECT_FALSE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); + + // Flag is turned on and the requested device is allowlisted. + rollout_config.AllowForDeviceInXlaCompileOnDemand(DeviceType(DEVICE_CPU)); + EXPECT_TRUE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); + + // The requested device is allowlisted, but the flag is turned off. 
+ rollout_config.enabled_for_compile_on_demand_ = false; + EXPECT_FALSE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compiler_options_util.cc b/tensorflow/compiler/jit/xla_compiler_options_util.cc index c8ffcfa6d8b..8580bcfbeef 100644 --- a/tensorflow/compiler/jit/xla_compiler_options_util.cc +++ b/tensorflow/compiler/jit/xla_compiler_options_util.cc @@ -86,8 +86,13 @@ XlaCompiler::Options GenerateCompilerOptionsForPjRt( options.device_ordinal = device_base->parsed_name().id; options.flib_def = function_library.GetFunctionLibraryDefinition(); options.graph_def_version = function_library.graph_def_version(); - if (platform_info.xla_device_metadata()) { - auto metadata = platform_info.xla_device_metadata(); + if (const auto* metadata = platform_info.xla_device_metadata(); + metadata != nullptr) { + options.device_type = metadata->jit_device_type(); + options.shape_determination_fns = + metadata->default_shape_determination_fns(); + } else if (const auto* metadata = platform_info.pjrt_device_metadata(); + metadata != nullptr) { options.device_type = metadata->jit_device_type(); options.shape_determination_fns = metadata->default_shape_determination_fns(); diff --git a/tensorflow/compiler/jit/xla_compiler_options_util_test.cc b/tensorflow/compiler/jit/xla_compiler_options_util_test.cc index 06bcfe2facb..2a4742567e4 100644 --- a/tensorflow/compiler/jit/xla_compiler_options_util_test.cc +++ b/tensorflow/compiler/jit/xla_compiler_options_util_test.cc @@ -23,6 +23,8 @@ limitations under the License. #include #include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/test_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -52,8 +54,8 @@ XlaDeviceCompiler* CreateXlaDeviceCompiler( std::move(compiler_client)); } -std::unique_ptr CreateXlaDeviceMetadata( - DeviceType compilation_device_type) { +std::vector +GetShapeDeterminationFns() { XlaHelpers::ShapeRepresentationFn shape_representation_fn = [](const TensorShape&, DataType, bool, XlaLayoutPreference) { return xla::Shape(); @@ -62,73 +64,83 @@ std::unique_ptr CreateXlaDeviceMetadata( [](const TensorShape&, DataType, std::optional) { return tensorflow::XlaLayoutPreference::kTpuPreferLinearLayout; }; - std::vector - shape_determination_fns = {XlaShapeLayoutHelpers::ShapeDeterminationFns{ - layout_preference_fn, shape_representation_fn}}; + return {XlaShapeLayoutHelpers::ShapeDeterminationFns{ + layout_preference_fn, shape_representation_fn}}; +} + +std::unique_ptr CreateXlaDeviceMetadata( + DeviceType compilation_device_type) { return std::make_unique( /*device_ordinal=*/0, /*platform=*/nullptr, compilation_device_type, - shape_determination_fns, XlaDevice::PaddedShapeFn(), + GetShapeDeterminationFns(), XlaDevice::PaddedShapeFn(), /*use_multiple_streams=*/false); } +std::unique_ptr CreatePjRtDeviceMetadata( + DeviceType compilation_device_type) { + return std::make_unique(compilation_device_type, + GetShapeDeterminationFns()); +} + class XlaCompilerOptionsTest : public ::testing::Test { protected: void SetUp() override { tensorflow::GetXlaDeviceFlags()->tf_xla_enable_xla_devices = true; } - void AddDevicesAndSetUp(const std::vector& device_names) { - SessionOptions options; - auto* device_count = options.config.mutable_device_count(); - for (const auto& 
device_name : device_names) { - device_count->insert({device_name, 1}); - } - - std::vector> devices; - TF_CHECK_OK(DeviceFactory::AddDevices( - options, "/job:localhost/replica:0/task:0", &devices)); - device_mgr_ = std::make_unique(std::move(devices)); - - OptimizerOptions opts; - lib_def_ = std::make_unique( - OpRegistry::Global(), FunctionDefLibrary()); - pflr_ = std::make_unique( - device_mgr_.get(), Env::Default(), /*config=*/nullptr, - TF_GRAPH_DEF_VERSION, lib_def_.get(), opts, - /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr); - flr_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0"); - } - - Device* GetXlaGpuDevice() { - if (device_mgr_ == nullptr) { - return nullptr; - } - - Device* device; - TF_CHECK_OK(device_mgr_->LookupDevice( - "/job:localhost/replica:0/task:0/device:XLA_GPU:0", &device)); - return device; - } - - FunctionLibraryRuntime* flr_; - std::unique_ptr device_mgr_; - std::unique_ptr lib_def_; - std::unique_ptr pflr_; + DeviceSetup device_setup_; }; TEST_F(XlaCompilerOptionsTest, PjRtOptionsXlaDevice) { - AddDevicesAndSetUp({DEVICE_XLA_GPU}); - Device* device = GetXlaGpuDevice(); + device_setup_.AddDevicesAndSetUp({DEVICE_XLA_GPU}); + Device* device = device_setup_.GetDevice(DEVICE_XLA_GPU); DeviceType compilation_device_type = DeviceType(DEVICE_GPU_XLA_JIT); se::Platform::Id platform_id = nullptr; auto xla_device_metadata = CreateXlaDeviceMetadata(compilation_device_type); std::shared_ptr custom_allocator; - XlaPlatformInfo platform_info(compilation_device_type, platform_id, - xla_device_metadata.get(), custom_allocator); + XlaPlatformInfo platform_info( + compilation_device_type, platform_id, xla_device_metadata.get(), + /*pjrt_device_metadata=*/nullptr, custom_allocator); - XlaCompiler::Options options = - GenerateCompilerOptionsForPjRt(*flr_, device, platform_info); + XlaCompiler::Options options = GenerateCompilerOptionsForPjRt( + *device_setup_.flr(), device, platform_info); + + EXPECT_EQ(options.device_type, compilation_device_type); + EXPECT_EQ(options.device_ordinal, 0); + EXPECT_NE(options.flib_def, nullptr); + EXPECT_EQ(options.graph_def_version, TF_GRAPH_DEF_VERSION); + EXPECT_FALSE(options.allow_cpu_custom_calls); + EXPECT_FALSE(options.alias_passthrough_params); + EXPECT_FALSE(options.detailed_logging); + // Check if options have the supplied shape determination functions set. + TF_ASSERT_OK_AND_ASSIGN( + auto shape, options.shape_determination_fns.shape_representation_fn( + TensorShape(), DT_FLOAT, false, + tensorflow::XlaLayoutPreference::kTpuPreferLinearLayout)); + EXPECT_EQ(shape, xla::Shape()); + EXPECT_EQ(options.shape_determination_fns.layout_preference_fn( + TensorShape(), DT_FLOAT, std::nullopt), + tensorflow::XlaLayoutPreference::kTpuPreferLinearLayout); +} + +TEST_F(XlaCompilerOptionsTest, PjRtOptionsPjRtBaseDevice) { + // Although DEVICE_CPU isn't a PjRtBaseDevice, we use it here just for testing + // purposes and to keep things simple. Creating a TpuDevice or + // NextPluggableDevice in the context of this unit test is non-trivial. 
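+  // The PjRtBaseDevice::Metadata that GenerateCompilerOptionsForPjRt reads
+  // from the platform info can be constructed directly from a jit device type
+  // and shape determination fns, so no PJRT-backed device is required.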
+ device_setup_.AddDevicesAndSetUp({DEVICE_CPU}); + Device* device = device_setup_.GetDevice(DEVICE_CPU); + DeviceType compilation_device_type = DeviceType(DEVICE_CPU_XLA_JIT); + + auto pjrt_device_metadata = CreatePjRtDeviceMetadata(compilation_device_type); + XlaPlatformInfo platform_info( + compilation_device_type, /*platform_id=*/nullptr, + /*xla_device_metadata=*/nullptr, + /*pjrt_device_metadata=*/pjrt_device_metadata.get(), + /*device_allocator=*/nullptr); + + XlaCompiler::Options options = GenerateCompilerOptionsForPjRt( + *device_setup_.flr(), device, platform_info); EXPECT_EQ(options.device_type, compilation_device_type); EXPECT_EQ(options.device_ordinal, 0); @@ -149,12 +161,12 @@ TEST_F(XlaCompilerOptionsTest, PjRtOptionsXlaDevice) { } TEST_F(XlaCompilerOptionsTest, XlaOptions) { - AddDevicesAndSetUp({DEVICE_XLA_CPU}); - Device* device = device_mgr_->HostCPU(); + device_setup_.AddDevicesAndSetUp({DEVICE_XLA_GPU}); + Device* device = device_setup_.GetDevice(DEVICE_XLA_GPU); xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); - DeviceType device_type = DeviceType(DEVICE_XLA_CPU); - DeviceType compilation_device_type = DeviceType(DEVICE_CPU_XLA_JIT); + DeviceType device_type = DeviceType(DEVICE_XLA_GPU); + DeviceType compilation_device_type = DeviceType(DEVICE_GPU_XLA_JIT); auto xla_device_compiler = CreateXlaDeviceCompiler( XlaDeviceExecutablePersistor::Config(), compilation_device_type, client); @@ -163,11 +175,13 @@ TEST_F(XlaCompilerOptionsTest, XlaOptions) { se::Platform::Id platform_id = se::host::kHostPlatformId; auto xla_device_metadata = CreateXlaDeviceMetadata(compilation_device_type); std::shared_ptr custom_allocator; - XlaPlatformInfo platform_info(device_type, platform_id, - xla_device_metadata.get(), custom_allocator); + XlaPlatformInfo platform_info( + device_type, platform_id, xla_device_metadata.get(), + /*pjrt_device_metadata=*/nullptr, custom_allocator); - XlaCompiler::Options options = GenerateCompilerOptions( - *xla_device_compiler, *flr_, device, nullptr, platform_info, false); + XlaCompiler::Options options = + GenerateCompilerOptions(*xla_device_compiler, *device_setup_.flr(), + device, nullptr, platform_info, false); EXPECT_EQ(options.device_type, compilation_device_type); EXPECT_NE(options.flib_def, nullptr); @@ -187,8 +201,8 @@ TEST_F(XlaCompilerOptionsTest, XlaOptions) { } TEST_F(XlaCompilerOptionsTest, XlaOptionsHasRefVarsNoXlaDeviceMetadata) { - AddDevicesAndSetUp({DEVICE_CPU}); - Device* device = device_mgr_->HostCPU(); + device_setup_.AddDevicesAndSetUp({DEVICE_CPU}); + Device* device = device_setup_.GetDevice(DEVICE_CPU); xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); DeviceType device_type = DeviceType(DEVICE_CPU); @@ -200,11 +214,13 @@ TEST_F(XlaCompilerOptionsTest, XlaOptionsHasRefVarsNoXlaDeviceMetadata) { se::Platform::Id platform_id = se::host::kHostPlatformId; std::shared_ptr custom_allocator; - XlaPlatformInfo platform_info(device_type, platform_id, nullptr, - custom_allocator); + XlaPlatformInfo platform_info( + device_type, platform_id, /*xla_device_metadata=*/nullptr, + /*pjrt_device_metadata=*/nullptr, custom_allocator); - XlaCompiler::Options options = GenerateCompilerOptions( - *xla_device_compiler, *flr_, device, nullptr, platform_info, false); + XlaCompiler::Options options = + GenerateCompilerOptions(*xla_device_compiler, *device_setup_.flr(), + device, nullptr, platform_info, false); EXPECT_EQ(options.device_type, compilation_device_type); EXPECT_NE(options.flib_def, nullptr); @@ -227,7 +243,7 @@ 
TEST_F(XlaCompilerOptionsTest, XlaOptionsHasRefVarsNoXlaDeviceMetadata) { } TEST_F(XlaCompilerOptionsTest, TfRtTpuOptions) { - AddDevicesAndSetUp({DEVICE_TPU_NODE}); + device_setup_.AddDevicesAndSetUp({DEVICE_TPU_NODE}); // Just use the default local client for testing purposes. xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); @@ -237,8 +253,8 @@ TEST_F(XlaCompilerOptionsTest, TfRtTpuOptions) { XlaDeviceExecutablePersistor::Config(), compilation_device_type, client); core::ScopedUnref xla_device_compiler_ref(xla_device_compiler); - XlaCompiler::Options options = - GenerateCompilerOptionsForTfrtTpu(*xla_device_compiler, *flr_); + XlaCompiler::Options options = GenerateCompilerOptionsForTfrtTpu( + *xla_device_compiler, *device_setup_.flr()); EXPECT_EQ(options.device_type, compilation_device_type); EXPECT_NE(options.flib_def, nullptr); diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 883f0edd91b..4742f8a72ea 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -29,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" #include "tensorflow/compiler/jit/xla_compile_util.h" #include "tensorflow/compiler/jit/xla_device_context.h" -#include "tensorflow/compiler/jit/xla_device_ops.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" @@ -213,6 +212,7 @@ XlaDevice::XlaDevice(const SessionOptions& session_options, : DefaultPaddedShapeFn, options.use_multiple_streams), device_ordinal_(options.device_ordinal), + device_name_(options.device_name), jit_device_name_(options.compilation_device_name), platform_(options.platform), intra_op_parallelism_threads_( @@ -272,7 +272,7 @@ Allocator* XlaDevice::GetAllocatorLocked(AllocatorAttributes attr) { } if (xla_allocator_ == nullptr) { - if (UsePjRtForSingleDeviceCompilation()) { + if (UsePjRtForSingleDeviceCompilation(device_name_)) { VLOG(1) << "XlaDevice " << this << " uses AsyncValueAllocator"; pjrt_allocator_ = std::make_unique(); xla_allocator_ = pjrt_allocator_.get(); @@ -308,16 +308,14 @@ Status XlaDevice::EnsureStreamOkLocked(xla::Backend* backend, } StatusOr> XlaDevice::GetDeviceContextLocked() { - if (UsePjRtForSingleDeviceCompilation()) { - // TODO(b/262472386) Support shape_determination_fns with PJRT. - if (shape_determination_fns_.size() > 1) { - return errors::Unimplemented( - "Use PJRT with multiple ShapeDeterminationFn is not implemented."); - } + if (UsePjRtForSingleDeviceCompilation(device_name_)) { if (device_contexts_.empty()) { - device_contexts_.emplace_back(new PjRtDeviceContext()); - VLOG(1) << "XlaDevice " << this << " new PjRtDeviceContext " - << device_contexts_[0]; + for (const auto& iter : shape_determination_fns_) { + auto device_context = new PjRtDeviceContext(iter); + VLOG(1) << "XlaDevice " << this << " new PjRtDeviceContext " + << device_context; + device_contexts_.emplace_back(device_context); + } if (use_accelerator_device_info_) { auto accelerator_device_info = std::make_unique(); diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index e902afb8425..26c7a8d9a1b 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -27,7 +27,6 @@ limitations under the License. 
#include #include "absl/types/optional.h" -#include "tensorflow/compiler/jit/xla_device_context.h" #include "tensorflow/compiler/jit/xla_tensor.h" #include "tensorflow/compiler/tf2xla/layout_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -226,6 +225,8 @@ class XlaDevice : public LocalDevice { const Metadata xla_metadata_; // Which hardware device in the client's platform this XlaDevice controls. const int device_ordinal_; + // The name/type of this XlaDevice. eg. "XLA_GPU". + const DeviceType device_name_; // The name of the device that is used to compile Ops for this XlaDevice. const DeviceType jit_device_name_; // The platform for this device. diff --git a/tensorflow/compiler/jit/xla_device_compiler_client.cc b/tensorflow/compiler/jit/xla_device_compiler_client.cc index 37ebffd95dc..46689a0d547 100644 --- a/tensorflow/compiler/jit/xla_device_compiler_client.cc +++ b/tensorflow/compiler/jit/xla_device_compiler_client.cc @@ -102,6 +102,8 @@ XlaDeviceCompilerClient::LoadExecutable( } void XlaDeviceCompilerClient::WaitForProgramsToFinish() { + if (client_ == nullptr) return; + for (auto* executor : client_->backend().stream_executors()) { bool ok = executor->SynchronizeAllActivity(); if (!ok) { diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index ee00464178f..0309086b41d 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -20,7 +20,6 @@ limitations under the License. #include #include -#include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" diff --git a/tensorflow/compiler/jit/xla_device_ops.cc b/tensorflow/compiler/jit/xla_device_ops.cc index 82aaa368d93..9305de9e47d 100644 --- a/tensorflow/compiler/jit/xla_device_ops.cc +++ b/tensorflow/compiler/jit/xla_device_ops.cc @@ -17,7 +17,6 @@ limitations under the License. #include -#include "tensorflow/compiler/jit/xla_device_context.h" #include "tensorflow/compiler/jit/xla_tensor.h" namespace tensorflow { diff --git a/tensorflow/compiler/jit/xla_kernel_creator.cc b/tensorflow/compiler/jit/xla_kernel_creator.cc index 934071d7ca4..fbf24aeda65 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator.cc @@ -14,6 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/jit/xla_kernel_creator.h" +#include +#include + #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" @@ -25,9 +28,18 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/mlir_bridge_pass.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/common_runtime/function_utils.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/node_properties.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/tsl/platform/errors.h" namespace tensorflow { @@ -87,7 +99,7 @@ Status XlaKernelCreator::CreateKernel( return CreateXlaKernel(flr, props->node_def, kernel); } -static bool RegisterLaunchOpCreator() { +bool RegisterLaunchOpCreator() { XlaKernelCreator* xla_kernel_creator = new XlaKernelCreator(); RegisterDefaultCustomKernelCreator(xla_kernel_creator); return true; diff --git a/tensorflow/compiler/jit/xla_kernel_creator.h b/tensorflow/compiler/jit/xla_kernel_creator.h index 856701a791d..843a21acd19 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.h +++ b/tensorflow/compiler/jit/xla_kernel_creator.h @@ -15,8 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_H_ #define TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_H_ +#include + #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_properties.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -39,6 +42,8 @@ class XlaKernelCreator : public CustomKernelCreator { std::unique_ptr* kernel) const override; }; +bool RegisterLaunchOpCreator(); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_XLA_KERNEL_CREATOR_H_ diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index bc348948e4f..0ae56482025 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -23,6 +23,7 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/cleanup/cleanup.h" +#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/jit/variable_info.h" #include "tensorflow/compiler/jit/variable_info_util.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" @@ -41,8 +42,11 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/tfrt/common/async_value_tensor.h" #include "tensorflow/core/util/stream_executor_util.h" +#include "tensorflow/tsl/framework/device_id_utils.h" #include "tensorflow/tsl/platform/status.h" +#include "tensorflow/tsl/platform/statusor.h" namespace tensorflow { namespace { @@ -60,10 +64,17 @@ se::Platform::Id XlaPlatformInfoFromDevice(DeviceBase* device_base) { return platform_id; } +absl::flat_hash_map CreateVariableLookup( + const std::vector& variables) { + absl::flat_hash_map variable_lookup; + for (int i = 0; i < variables.size(); i++) { + variable_lookup[variables[i].index()] = i; + } + return variable_lookup; +} + } // anonymous namespace - - std::vector InputsFromContext(OpKernelContext* ctx) { std::vector inputs; inputs.reserve(ctx->num_inputs()); @@ -576,4 +587,157 @@ XlaComputationLaunchContext::BuildXlaCompilerArguments( return out; } +void PreparePjRtExecutableArguments( + const std::vector& input_mapping, + const std::vector& inputs, + const std::vector& variables, + std::vector* args, + absl::flat_hash_set* non_donatable_input_indices) { + const auto& variable_lookup = CreateVariableLookup(variables); + + for (auto arg_num : input_mapping) { + const Tensor* tensor; + if (auto it = variable_lookup.find(arg_num); it != variable_lookup.end()) { + tensor = variables[it->second].var()->tensor(); + } else { + tensor = inputs[arg_num]; + } + if (!tensor->RefCountIsOne()) { + non_donatable_input_indices->insert(arg_num); + } + + AsyncValueTensor* av_tensor = AsyncValueTensor::FromTensor(tensor); + if (av_tensor->GetBuffer() == nullptr) { + // TODO(b/260799971): verify size 0 argument is supported. + CHECK_EQ(tensor->NumElements(), 0); // Crash OK + continue; + } + args->push_back(av_tensor->GetBuffer().get()); + } +} + +Status PopulateCtxOutputsFromPjRtExecutableOutputs( + const std::vector& inputs, + const std::vector& variables, + const XlaCompiler::CompilationResult& compilation_result, + std::vector>& executable_outputs, + OpKernelContext* ctx) { + const auto& variable_lookup = CreateVariableLookup(variables); + + // Copy XLA results to the OpOutputList. + int output_num = 0; + for (int i = 0, end = ctx->num_outputs(); i < end; ++i) { + const DataType& type = compilation_result.outputs[i].type; + VLOG(2) << "Populating output for retval " << i << " type " + << DataTypeString(type); + + if (compilation_result.outputs[i].is_constant) { + bool requires_copy_to_device = GetDeviceType(ctx) != DEVICE_CPU; + TF_RETURN_IF_ERROR(SetOutputForConstant(ctx, requires_copy_to_device, + &compilation_result, i)); + } else if (type == DT_RESOURCE) { + int input_index = compilation_result.outputs[i].input_index; + TF_RET_CHECK(input_index >= 0 && input_index < ctx->num_inputs()) + << "Invalid input for outputs " << i << ": " << input_index; + ctx->set_output(i, *inputs[input_index]); + } else { + Tensor* output_tensor; + TF_ASSIGN_OR_RETURN( + xla::Shape device_shape, + executable_outputs[output_num]->logical_on_device_shape()); + TensorShape tensor_shape; + TF_RETURN_IF_ERROR(XLAShapeToTensorShape(device_shape, &tensor_shape)); + TF_RETURN_IF_ERROR(ctx->allocate_output(i, tensor_shape, &output_tensor)); + auto output_avt = AsyncValueTensor::FromTensor(output_tensor); + output_avt->SetBuffer(std::move(executable_outputs[output_num])); + ++output_num; + } + } + + // Apply variable updates, if any. 
+ for (int i = 0, end = compilation_result.resource_updates.size(); i < end; + ++i) { + const XlaCompiler::ResourceUpdate& write = + compilation_result.resource_updates[i]; + int actual_input_index = write.input_index; + CHECK_GE(actual_input_index, 0); // Crash OK + CHECK_LT(actual_input_index, ctx->num_inputs()); // Crash OK + auto it = variable_lookup.find(actual_input_index); + if (it == variable_lookup.end()) { + continue; + } + Var* var = variables[it->second].var(); + CHECK(var); // Crash OK + + VLOG(2) << "Updating variable #" << i + << " at input index: " << actual_input_index << " with shape " + << write.shape.DebugString() << "; variable tensor has shape: " + << var->tensor()->shape().DebugString(); + + if (var->is_initialized && var->tensor()->dtype() != write.type) { + return errors::Internal("Mismatched type in variable write"); + } + + TF_RETURN_IF_ERROR(ctx->allocate_temp( + var->tensor()->dtype(), var->tensor()->shape(), var->tensor())); + AsyncValueTensor::FromTensor(var->tensor()) + ->SetBuffer(std::move(executable_outputs[output_num])); + var->is_initialized |= write.modified; + ++output_num; + } + return OkStatus(); +} + +xla::ExecuteOptions GetPjRtExecuteOptions( + absl::flat_hash_set non_donatable_input_indices) { + xla::ExecuteOptions options; + options.arguments_are_tupled = false; + options.untuple_result = true; + // Note: TF does not use PJRT host callbacks as of today. Setting this option + // to true to workaround an ExecuteOptions check: [1]. + // + // [1]: + // tensorflow/compiler/xla/pjrt/pjrt_c_api_client.cc;l=923-927;rcl=519286815 + options.use_major_to_minor_data_layout_for_callbacks = true; + options.non_donatable_input_indices = std::move(non_donatable_input_indices); + return options; +} + +DeviceType GetDeviceType(OpKernelContext* ctx) { + auto* device = + tensorflow::down_cast(ctx->device()->UnderlyingDevice()); + return DeviceType(device->device_type()); +} + +Status RunPjRtExecutable( + const xla::PjRtClient& pjrt_client, + const std::vector& inputs, + const std::vector& variables, + const XlaCompiler::CompilationResult& compilation_result, + xla::PjRtLoadedExecutable* executable, OpKernelContext* ctx) { + TF_ASSIGN_OR_RETURN(const int pjrt_device_id, + tsl::GetDeviceIdFromDeviceParsedName( + ctx->device()->parsed_name(), GetDeviceType(ctx))); + TF_ASSIGN_OR_RETURN(xla::PjRtDevice * device, + pjrt_client.LookupAddressableDevice(pjrt_device_id)); + + std::vector executable_args; + executable_args.reserve(compilation_result.input_mapping.size()); + absl::flat_hash_set non_donatable_input_indices; + PreparePjRtExecutableArguments(compilation_result.input_mapping, inputs, + variables, &executable_args, + &non_donatable_input_indices); + // TODO(b/257548614): currently PJRT is compiled as portable (num_replica = 1 + // and num_partition = 1). Support multiple partitions case. + TF_ASSIGN_OR_RETURN( + std::vector> execute_outputs, + executable->ExecutePortable( + executable_args, device, + GetPjRtExecuteOptions(std::move(non_donatable_input_indices)))); + + TF_RETURN_IF_ERROR(PopulateCtxOutputsFromPjRtExecutableOutputs( + inputs, variables, compilation_result, execute_outputs, ctx)); + return OkStatus(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 0e7a806d79c..1a9771068fc 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -19,6 +19,7 @@ limitations under the License. 
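Editor's note: taken together, the helpers added above form the new PJRT execution path. PreparePjRtExecutableArguments() turns kernel inputs and resource variables into PjRtBuffers, ExecutePortable() runs the executable with the options from GetPjRtExecuteOptions(), and PopulateCtxOutputsFromPjRtExecutableOutputs() writes results and variable updates back into the OpKernelContext. RunPjRtExecutable() bundles those steps; the sketch below shows how a launch-style kernel might call it, assuming the compilation result and loaded executable were produced earlier by a PjRtDeviceCompiler. The wrapper name RunThroughPjRt is illustrative and not part of this diff.

```c++
#include "tensorflow/compiler/jit/variable_info_util.h"
#include "tensorflow/compiler/jit/xla_launch_util.h"
#include "tensorflow/core/tfrt/common/create_pjrt_client_util.h"

namespace tensorflow {

// Illustrative wrapper: run an already-compiled executable through PJRT and
// populate `ctx` with its outputs.
Status RunThroughPjRt(OpKernelContext* ctx,
                      const XlaCompiler::CompilationResult& compilation_result,
                      xla::PjRtLoadedExecutable* executable) {
  std::vector<const Tensor*> inputs = InputsFromContext(ctx);
  std::vector<int> variable_indices =
      GetResourceVariableIndicesFromContext(ctx);

  std::vector<VariableInfo> variables;
  variables.reserve(variable_indices.size());
  TF_RETURN_IF_ERROR(GetVariableInfosFromInputs(ctx->resource_manager(),
                                                ctx->device(), inputs,
                                                variable_indices, &variables));
  // Variables must be locked while their buffers are read and updated.
  TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variables)));

  TF_ASSIGN_OR_RETURN(xla::PjRtClient * pjrt_client,
                      GetOrCreatePjRtClient(GetDeviceType(ctx)));
  // Prepares the PjRtBuffer arguments, calls ExecutePortable() and copies the
  // results (including resource updates) back into `ctx`.
  return RunPjRtExecutable(*pjrt_client, inputs, variables, compilation_result,
                           executable, ctx);
}

}  // namespace tensorflow
```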
#define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ #include +#include #include #include @@ -26,6 +27,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_tensor.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/stream_executor/device_memory_allocator.h" #include "tensorflow/core/framework/allocation_description.pb.h" @@ -51,6 +53,53 @@ Status SetOutputForConstant( OpKernelContext* ctx, bool requires_copy_to_device, const XlaCompiler::CompilationResult* compilation_result, int output_num); +// Converts input tensors and variables which are parameters of the +// XlaComputation into PjRtBuffers to be fed as input to the +// PjRtLoadedExecutable. `input_mapping` is a vector that maps from the +// parameters of the XlaComputation to their original argument positions. This +// can be sourced from `XlaCompiler::CompilationResult::input_mapping`. +// +// The obtained PjRtBuffers are populated to `args` vector. +// `non_donatable_input_indices` will also be set, which contains the indices of +// the input that should not be donated to output. +void PreparePjRtExecutableArguments( + const std::vector& input_mapping, + const std::vector& inputs, + const std::vector& variables, + std::vector* args, + absl::flat_hash_set* non_donatable_input_indices); + +// Populates the OpKernelContext outputs with the outputs of the +// PjRtLoadedExecutable. Requires the `compilation_result` used to build the +// PjRtLoadedExecutable. +Status PopulateCtxOutputsFromPjRtExecutableOutputs( + const std::vector& inputs, + const std::vector& variables, + const XlaCompiler::CompilationResult& compilation_result, + std::vector>& executable_outputs, + OpKernelContext* ctx); + +// Returns the options used for executing a PjRtLoadedExecutable. +xla::ExecuteOptions GetPjRtExecuteOptions( + absl::flat_hash_set non_donatable_input_indices); + +// Returns the device ordinal from the parsed name of the device. +int GetDeviceOrdinal(const DeviceBase* device); + +// Returns the device type from the OpKernelContext. +DeviceType GetDeviceType(OpKernelContext* ctx); + +// Runs `executable` and populates the outputs in `ctx`. `inputs` and +// `variables` are the input arguments to the computation, usually read from the +// OpKernelContext, `ctx`. Requires the device-appropriate `pjrt_client` and the +// `compilation_result` used to build the `executable`. +Status RunPjRtExecutable( + const xla::PjRtClient& pjrt_client, + const std::vector& inputs, + const std::vector& variables, + const XlaCompiler::CompilationResult& compilation_result, + xla::PjRtLoadedExecutable* executable, OpKernelContext* ctx); + // Helper class to perform the marshalling of TensorFlow inputs and outputs to // ShapedBuffers suitable for passing to an XLA computation. class XlaComputationLaunchContext { diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc new file mode 100644 index 00000000000..789daa318e1 --- /dev/null +++ b/tensorflow/compiler/jit/xla_launch_util_test.cc @@ -0,0 +1,542 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_launch_util.h" + +#include +#include +#include +#include + +#include +#include "absl/container/flat_hash_set.h" +#include "tensorflow/compiler/jit/device_compiler.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/pjrt_device_compiler_client.h" +#include "tensorflow/compiler/jit/variable_info.h" +#include "tensorflow/compiler/jit/variable_info_util.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/tfrt/common/create_pjrt_client_util.h" +#include "tensorflow/core/tfrt/common/pjrt_util.h" +#include "tensorflow/tsl/framework/allocator.h" +#include "tensorflow/tsl/lib/core/status_test_util.h" +#include "tensorflow/tsl/platform/status.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace tensorflow { +namespace { +using PjRtDeviceCompiler = + DeviceCompiler; +using PjRtDeviceExecutablePersistor = + DeviceExecutablePersistor; + +class PjRtExecutionUtilTest : public OpsTestBase { + public: + PjRtExecutionUtilTest() { + // Set flag to use PJRT for device compilation and execution. + auto& rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api; + rollout_config.enabled_for_xla_launch_ = true; + rollout_config.enabled_for_compile_on_demand_ = true; + + // Set flag to enable using XLA devices. PJRT currently is only supported + // for XLA devices. + GetXlaDeviceFlags()->tf_xla_enable_xla_devices = true; + + // Add and setup the XLA_CPU device. + auto device_type = DeviceType(DEVICE_XLA_CPU); + rollout_config.AllowForDeviceInXlaLaunch(device_type); + rollout_config.AllowForDeviceInXlaCompileOnDemand(device_type); + + auto jit_device_type = DeviceType(DEVICE_CPU_XLA_JIT); + auto device = + DeviceFactory::NewDevice(device_type.type_string(), SessionOptions(), + "/job:localhost/replica:0/task:0"); + device_ = device.get(); + SetDevice(device_type, std::move(device)); + + // Create PjRtClient for XLA_CPU. + TF_CHECK_OK(SetPjRtClientInTFGlobalResourceManager( + device_type, + xla::GetTfrtCpuClient(/*asynchronous=*/true, /*cpu_device_count=*/1) + .value())); + + // device_context_ should be a PjRtDeviceContext. + TF_CHECK_OK(device_->TryGetDeviceContext(&device_context_)); + + // Get the host allocator. + AllocatorAttributes host_alloc_attr; + host_alloc_attr.set_on_host(true); + host_allocator_ = device_->GetAllocator(host_alloc_attr); + + // Get the device allocator. 
This should give us an AsyncValueAllocator. + AllocatorAttributes device_alloc_attr; + device_alloc_attr.set_on_host(false); + device_allocator_ = device_->GetAllocator(device_alloc_attr); + + // Create the DeviceCompiler to help with compiling executables. + auto pjrt_client_or = GetOrCreatePjRtClient(device_type_); + TF_CHECK_OK(pjrt_client_or.status()); + pjrt_client_ = pjrt_client_or.value(); + device_compiler_ = new PjRtDeviceCompiler( + std::make_unique( + PjRtDeviceExecutablePersistor::Config(), jit_device_type), + std::make_unique(pjrt_client_)); + profiler_ = new DeviceCompilationProfiler(); + + compiler_options_.device_type = jit_device_type; + compiler_options_.client = nullptr; + compiler_options_.flib_def = flib_def_.get(); + } + + ~PjRtExecutionUtilTest() override { + for (const auto& tensor : tensors_) { + delete tensor; + } + tensors_.clear(); + device_context_->Unref(); + core::ScopedUnref device_compiler_ref(device_compiler_); + core::ScopedUnref profiler_ref(profiler_); + } + + // Creates a Tensor on host using the host_allocator_ + template + Tensor* CreateHostTensor(const TensorShape& shape, + const gtl::ArraySlice data) { + Tensor* host_tensor = + new Tensor(host_allocator_, DataTypeToEnum::v(), shape); + test::FillValues(host_tensor, data); + tensors_.push_back(host_tensor); + return host_tensor; + } + + // Creates a Tensor on device using the device_allocator_ + template + Tensor* CreateDeviceTensor(const TensorShape& shape, + const gtl::ArraySlice data) { + Tensor* host_tensor = CreateHostTensor(shape, data); + Tensor* device_tensor = + new Tensor(device_allocator_, DataTypeToEnum::v(), shape); + TF_EXPECT_OK(device_context_->CopyCPUTensorToDeviceSync( + host_tensor, device_, device_tensor)); + + tensors_.push_back(device_tensor); + return device_tensor; + } + + // Gets the `output_index`-th output set in the context_ + Tensor* GetOutput(int output_index) { + CHECK_LT(output_index, context_->num_outputs()); + Tensor* device_tensor = context_->mutable_output(output_index); + managed_outputs_.resize(context_->num_outputs()); + if (managed_outputs_[output_index]) { + return managed_outputs_[output_index]; + } + + Tensor* host_tensor = new Tensor(host_allocator_, device_tensor->dtype(), + device_tensor->shape()); + TF_EXPECT_OK(device_context_->CopyDeviceTensorToCPUSync( + device_tensor, "", device_, host_tensor)); + managed_outputs_[output_index] = host_tensor; + return host_tensor; + } + + // Compiles the op set in the context_ to a PjRtLoadedExecutable + void CompileToExecutable(const std::vector& args, + const XlaCompiler::CompilationResult** result, + xla::PjRtLoadedExecutable** executable, + XlaCompiler::CompileOptions compile_options = {}) { + TF_EXPECT_OK(device_compiler_->CompileSingleOpIfNeeded( + compiler_options_, args, compile_options, context_.get(), profiler_, + result, executable)); + } + + // Runs a PjRtLoadedExecutable with the given inputs, variables. Requires the + // XlaCompiler::CompilationResult that was used to build the executable. 
+ StatusOr>> RunExecutable( + const std::vector& inputs, + const std::vector& variables, + const XlaCompiler::CompilationResult* result, + xla::PjRtLoadedExecutable* executable) { + TF_ASSIGN_OR_RETURN(auto pjrt_device, pjrt_client_->LookupAddressableDevice( + device_->parsed_name().id)); + + std::vector executable_args; + executable_args.reserve(result->input_mapping.size()); + absl::flat_hash_set non_donatable_input_indices; + PreparePjRtExecutableArguments(result->input_mapping, inputs, variables, + &executable_args, + &non_donatable_input_indices); + + xla::ExecuteOptions exe_options; + exe_options.arguments_are_tupled = false; + exe_options.untuple_result = true; + + // TODO(b/257548614): currently PJRT is compiled as portable (num_replica = + // 1 and num_partition = 1). Support multiple partitions case. + return executable->ExecutePortable(executable_args, pjrt_device, + exe_options); + } + + // Creates a Variable. Doesn't add it to the resource manager. + template + Var* CreateVariable(const string& name, const TensorShape& shape, + const gtl::ArraySlice data) { + Tensor* init_var_value = CreateDeviceTensor(shape, data); + Var* var = new Var(DataTypeToEnum::v()); + *var->tensor() = *init_var_value; + var->is_initialized = true; + + return var; + } + + // Creates a Variable, adds it to the resource manager and also adds it as one + // of the inputs in the context_ + template + void AddVariableInput(const string& name, const TensorShape& shape, + const gtl::ArraySlice data) { + Var* var = CreateVariable(name, shape, data); + ResourceMgr* rm = device_->resource_manager(); + TF_ASSERT_OK(rm->Create(rm->default_container(), name, var)); + + ResourceHandle handle; + handle.set_device(device_->name()); + handle.set_container(rm->default_container()); + handle.set_name(name); + TypeIndex type_index = TypeIndex::Make(); + handle.set_hash_code(type_index.hash_code()); + handle.set_maybe_type_name(type_index.name()); + + Tensor* input = new Tensor(host_allocator_, DT_RESOURCE, TensorShape({})); + input->scalar()() = handle; + tensors_.push_back(input); + inputs_.push_back({nullptr, input}); + } + + protected: + DeviceContext* device_context_; + Allocator* host_allocator_; + Allocator* device_allocator_; + + XlaCompiler::Options compiler_options_; + xla::PjRtClient* pjrt_client_; + PjRtDeviceCompiler* device_compiler_; + DeviceCompilationProfiler* profiler_; +}; + +TEST_F(PjRtExecutionUtilTest, PreparePjRtExecutableArguments) { + std::vector inputs; + inputs.push_back(CreateDeviceTensor(TensorShape({1, 3}), {0, 0, 0})); + inputs.push_back(CreateDeviceTensor(TensorShape({1, 3}), {1, 2, 3})); + inputs.push_back(CreateDeviceTensor(TensorShape({1, 3}), {4, 5, 6})); + std::vector input_mapping{1, 2}; + + std::vector exec_args; + exec_args.reserve(input_mapping.size()); + absl::flat_hash_set non_donatable_input_indices; + PreparePjRtExecutableArguments(input_mapping, inputs, {}, &exec_args, + &non_donatable_input_indices); + + EXPECT_EQ(exec_args.size(), 2); + + std::shared_ptr literal1 = *exec_args[0]->ToLiteralSync(); + EXPECT_TRUE(xla::LiteralTestUtil::Equal( + *literal1, xla::LiteralUtil::CreateR2({{1, 2, 3}}))); + + std::shared_ptr literal2 = *exec_args[1]->ToLiteralSync(); + EXPECT_TRUE(xla::LiteralTestUtil::Equal( + *literal2, xla::LiteralUtil::CreateR2({{4, 5, 6}}))); +} + +TEST_F(PjRtExecutionUtilTest, PreparePjRtExecutableArgumentsVariableInputs) { + std::vector variables; + Var* var1 = CreateVariable("v1", TensorShape({1, 2}), {1, 2}); + variables.emplace_back(1, "v1", var1); + Var* var2 
= CreateVariable("v2", TensorShape({1, 2}), {3, 4}); + variables.emplace_back(2, "v2", var2); + + std::vector inputs; + inputs.push_back(CreateDeviceTensor(TensorShape({1, 3}), {0, 0, 0})); + std::vector input_mapping{1, 2}; + + std::vector exec_args; + exec_args.reserve(input_mapping.size()); + absl::flat_hash_set non_donatable_input_indices; + PreparePjRtExecutableArguments(input_mapping, inputs, variables, &exec_args, + &non_donatable_input_indices); + + EXPECT_EQ(exec_args.size(), 2); + + std::shared_ptr literal1 = *exec_args[0]->ToLiteralSync(); + EXPECT_TRUE(xla::LiteralTestUtil::Equal( + *literal1, xla::LiteralUtil::CreateR2({{1, 2}}))); + + std::shared_ptr literal2 = *exec_args[1]->ToLiteralSync(); + EXPECT_TRUE(xla::LiteralTestUtil::Equal( + *literal2, xla::LiteralUtil::CreateR2({{3, 4}}))); +} + +TEST_F(PjRtExecutionUtilTest, PopulateCtxOutputs) { + XlaOpRegistry::RegisterCompilationKernels(); + TF_EXPECT_OK(NodeDefBuilder("AddV2", "AddV2") + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT32)) + .Attr("T", DT_INT32) + .Device("/job:localhost/replica:0/task:0/device:XLA_CPU:0") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + // Add inputs. + Tensor* a = CreateDeviceTensor(TensorShape({1, 3}), {1, 2, 3}); + Tensor* b = CreateDeviceTensor(TensorShape({1, 3}), {4, 5, 6}); + inputs_.push_back({nullptr, a}); + inputs_.push_back({nullptr, b}); + + CreateContext(); + + std::vector args(2); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({1, 3}); + args[1].kind = XlaCompiler::Argument::kParameter; + args[1].type = DT_INT32; + args[1].shape = TensorShape({1, 3}); + + const XlaCompiler::CompilationResult* result; + xla::PjRtLoadedExecutable* executable; + CompileToExecutable(args, &result, &executable); + + std::vector inputs; + inputs.push_back(a); + inputs.push_back(b); + TF_ASSERT_OK_AND_ASSIGN(auto execute_outputs, + RunExecutable(inputs, {}, result, executable)); + + TF_EXPECT_OK(PopulateCtxOutputsFromPjRtExecutableOutputs( + inputs, {}, *result, execute_outputs, context_.get())); + + Tensor* expected = CreateHostTensor(TensorShape({1, 3}), {5, 7, 9}); + test::ExpectTensorEqual(*expected, *GetOutput(0)); +} + +TEST_F(PjRtExecutionUtilTest, PopulateCtxOutputsDynamicShape) { + XlaOpRegistry::RegisterCompilationKernels(); + TF_EXPECT_OK(NodeDefBuilder("testWhere", "Where") + .Input(FakeInput(DT_FLOAT)) + .Attr("T", DT_FLOAT) + .Device("/job:localhost/replica:0/task:0/device:XLA_CPU:0") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + // Add inputs. + Tensor* a = + CreateDeviceTensor(TensorShape({2, 3}), {0., 1., 1., 0., 0., 0.}); + inputs_.push_back({nullptr, a}); + + CreateContext(); + + std::vector args(1); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_FLOAT; + args[0].shape = TensorShape({2, 3}); + + const XlaCompiler::CompilationResult* result; + xla::PjRtLoadedExecutable* executable; + CompileToExecutable(args, &result, &executable); + + std::vector inputs; + inputs.push_back(a); + TF_ASSERT_OK_AND_ASSIGN(auto execute_outputs, + RunExecutable(inputs, {}, result, executable)); + + TF_EXPECT_OK(PopulateCtxOutputsFromPjRtExecutableOutputs( + inputs, {}, *result, execute_outputs, context_.get())); + // The expected output is indices of non-zero inputs. 
+ Tensor* expected = CreateHostTensor(TensorShape({2, 2}), {0, 1, 0, 2}); + test::ExpectTensorEqual(*expected, *GetOutput(0)); +} + +TEST_F(PjRtExecutionUtilTest, PopulateCtxOutputsVariableInputs) { + XlaOpRegistry::RegisterCompilationKernels(); + TF_EXPECT_OK(NodeDefBuilder("AddV2", "AddV2") + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT32)) + .Attr("T", DT_INT32) + .Device("/job:localhost/replica:0/task:0/device:XLA_CPU:0") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + AddVariableInput("var1", TensorShape({1, 2}), {1, 2}); + AddVariableInput("var2", TensorShape({1, 2}), {3, 4}); + + CreateContext(); + + std::vector args(2); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].initialized = true; + args[0].type = DT_INT32; + args[0].shape = TensorShape({1, 2}); + args[1].kind = XlaCompiler::Argument::kParameter; + args[1].initialized = true; + args[1].type = DT_INT32; + args[1].shape = TensorShape({1, 2}); + + const XlaCompiler::CompilationResult* result; + xla::PjRtLoadedExecutable* executable; + CompileToExecutable(args, &result, &executable); + + std::vector inputs = InputsFromContext(context_.get()); + std::vector variables_indices = + GetResourceVariableIndicesFromContext(context_.get()); + std::vector variables; + variables.reserve(variables_indices.size()); + TF_ASSERT_OK(GetVariableInfosFromInputs(context_->resource_manager(), + context_->device(), inputs, + variables_indices, &variables)); + TF_ASSERT_OK_AND_ASSIGN(auto execute_outputs, + RunExecutable(inputs, variables, result, executable)); + TF_EXPECT_OK(PopulateCtxOutputsFromPjRtExecutableOutputs( + inputs, variables, *result, execute_outputs, context_.get())); + + Tensor* expected = CreateHostTensor(TensorShape({1, 2}), {4, 6}); + test::ExpectTensorEqual(*expected, *GetOutput(0)); +} + +TEST_F(PjRtExecutionUtilTest, PopulateCtxOutputsResourceUpdates) { + XlaOpRegistry::RegisterCompilationKernels(); + TF_EXPECT_OK(NodeDefBuilder("AssignAddVariableOp", "AssignAddVariableOp") + .Input(FakeInput(DT_RESOURCE)) + .Input(FakeInput(DT_INT32)) + .Attr("dtype", DT_INT32) + .Device("/job:localhost/replica:0/task:0/device:XLA_CPU:0") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + AddVariableInput("var", TensorShape({1, 3}), {1, 2, 3}); + Tensor* a = CreateDeviceTensor(TensorShape({1, 3}), {2, 2, 2}); + inputs_.push_back({nullptr, a}); + + CreateContext(); + + std::vector inputs = InputsFromContext(context_.get()); + std::vector variables_indices = + GetResourceVariableIndicesFromContext(context_.get()); + std::vector variables; + variables.reserve(variables_indices.size()); + TF_ASSERT_OK(GetVariableInfosFromInputs(context_->resource_manager(), + context_->device(), inputs, + variables_indices, &variables)); + TF_ASSERT_OK_AND_ASSIGN(std::vector constant_input_indices, + GetConstantInputIndicesFromContext(context_.get())); + TF_ASSERT_OK(LockVariables(absl::MakeSpan(variables))); + TF_ASSERT_OK_AND_ASSIGN( + std::vector args, + XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_input_indices, inputs, variables, + static_cast(context_->device()))); + + const XlaCompiler::CompilationResult* result; + xla::PjRtLoadedExecutable* executable; + CompileToExecutable(args, &result, &executable); + TF_ASSERT_OK_AND_ASSIGN(auto execute_outputs, + RunExecutable(inputs, variables, result, executable)); + + TF_EXPECT_OK(PopulateCtxOutputsFromPjRtExecutableOutputs( + inputs, variables, *result, execute_outputs, context_.get())); + + // Verify that there are no outputs. 
+ EXPECT_EQ(context_->num_outputs(), 0); + + // Verify that the original variable was updated. + ResourceMgr* rm = device_->resource_manager(); + Var* var = nullptr; + TF_ASSERT_OK(rm->Lookup(rm->default_container(), "var", &var)); + core::ScopedUnref var_ref(var); + + Tensor* device_tensor = var->tensor(); + Tensor* host_tensor = new Tensor(host_allocator_, device_tensor->dtype(), + device_tensor->shape()); + tensors_.push_back(host_tensor); + TF_ASSERT_OK(device_context_->CopyDeviceTensorToCPUSync( + device_tensor, "", device_, host_tensor)); + + Tensor* expected = CreateHostTensor(TensorShape({1, 3}), {3, 4, 5}); + test::ExpectTensorEqual(*expected, *host_tensor); +} + +TEST(XlaLaunchUtilTest, GetPjRtExecuteOptions) { + xla::ExecuteOptions options = GetPjRtExecuteOptions({}); + EXPECT_FALSE(options.arguments_are_tupled); + EXPECT_TRUE(options.untuple_result); + EXPECT_TRUE(options.use_major_to_minor_data_layout_for_callbacks); +} + +TEST_F(PjRtExecutionUtilTest, RunPjRtExecutable) { + XlaOpRegistry::RegisterCompilationKernels(); + TF_EXPECT_OK(NodeDefBuilder("AddV2", "AddV2") + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT32)) + .Attr("T", DT_INT32) + .Device("/job:localhost/replica:0/task:0/device:XLA_CPU:0") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + AddVariableInput("var1", TensorShape({1, 2}), {1, 2}); + AddVariableInput("var2", TensorShape({1, 2}), {3, 4}); + + CreateContext(); + + std::vector args(2); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].initialized = true; + args[0].type = DT_INT32; + args[0].shape = TensorShape({1, 2}); + args[1].kind = XlaCompiler::Argument::kParameter; + args[1].initialized = true; + args[1].type = DT_INT32; + args[1].shape = TensorShape({1, 2}); + + const XlaCompiler::CompilationResult* result; + xla::PjRtLoadedExecutable* executable; + CompileToExecutable(args, &result, &executable); + + std::vector inputs = InputsFromContext(context_.get()); + std::vector variables_indices = + GetResourceVariableIndicesFromContext(context_.get()); + std::vector variables; + variables.reserve(variables_indices.size()); + TF_ASSERT_OK(GetVariableInfosFromInputs(context_->resource_manager(), + context_->device(), inputs, + variables_indices, &variables)); + TF_ASSERT_OK(RunPjRtExecutable(*pjrt_client_, inputs, variables, *result, + executable, context_.get())); + + Tensor* expected = CreateHostTensor(TensorShape({1, 2}), {4, 6}); + test::ExpectTensorEqual(*expected, *GetOutput(0)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_platform_info.cc b/tensorflow/compiler/jit/xla_platform_info.cc index df4ab4460f5..b311faa13ab 100644 --- a/tensorflow/compiler/jit/xla_platform_info.cc +++ b/tensorflow/compiler/jit/xla_platform_info.cc @@ -17,20 +17,62 @@ limitations under the License. 
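Editor's note: in the xla_platform_info.cc changes that follow, the visible-GPU handling is factored into a GetAllowedGpus() helper that reads gpu_options().visible_device_list() from the FunctionLibraryRuntime's ConfigProto and forwards the parsed list to the LocalClient options and, in the PJRT path, to GetOrCreatePjRtClient(). A small sketch of the configuration side, assuming the caller sets up SessionOptions in C++; the proto fields are the real ConfigProto ones, while the helper function itself is illustrative.

```c++
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/public/session_options.h"

// Illustrative: restrict the GPUs visible to TensorFlow (and therefore the
// device list GetAllowedGpus() reports) to GPUs 0 and 2.
tensorflow::SessionOptions MakeSessionOptionsWithVisibleGpus() {
  tensorflow::SessionOptions options;
  options.config.mutable_gpu_options()->set_visible_device_list("0,2");
  return options;
}
```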
#include #include +#include #include #include #include "tensorflow/compiler/jit/device_executable_persistor.h" #include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/pjrt_device_compiler_client.h" #include "tensorflow/compiler/jit/xla_device_compiler_client.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/core/tfrt/common/create_pjrt_client_util.h" +#include "tensorflow/core/tfrt/common/pjrt_util.h" #include "tensorflow/core/tpu/tpu_defs.h" namespace tensorflow { namespace { using XlaDeviceCompiler = DeviceCompiler; +using PjRtDeviceCompiler = + DeviceCompiler; +using XlaDeviceExecutablePersistor = + DeviceExecutablePersistor; +using PjRtDeviceExecutablePersistor = + DeviceExecutablePersistor; + +XlaDeviceCompiler* CreateXlaDeviceCompiler( + const XlaDeviceExecutablePersistor::Config& persistor_config, + DeviceType device_type, xla::LocalClient* local_client) { + return new XlaDeviceCompiler( + std::make_unique( + std::move(persistor_config), device_type), + std::make_unique(local_client)); +} + +PjRtDeviceCompiler* CreatePjRtDeviceCompiler( + const PjRtDeviceExecutablePersistor::Config& persistor_config, + DeviceType device_type, xla::PjRtClient* pjrt_client) { + return new PjRtDeviceCompiler( + std::make_unique( + std::move(persistor_config), device_type), + std::make_unique(pjrt_client)); +} + +StatusOr>> GetAllowedGpus( + FunctionLibraryRuntime* flr) { + std::optional> gpu_ids = std::nullopt; + + if (flr->config_proto()) { + string allowed_gpus = + flr->config_proto()->gpu_options().visible_device_list(); + TF_ASSIGN_OR_RETURN(gpu_ids, ParseVisibleDeviceList(allowed_gpus)); + } + + return gpu_ids; +} } // namespace xla::StatusOr>> ParseVisibleDeviceList( @@ -57,32 +99,27 @@ xla::StatusOr>> ParseVisibleDeviceList( Status BuildXlaDeviceCompiler(DeviceBase* device, FunctionLibraryRuntime* flr, const XlaPlatformInfo& platform_info, XlaDeviceCompiler** xla_device_compiler) { - using XlaDeviceExecutablePersistor = - DeviceExecutablePersistor; XlaDeviceExecutablePersistor::Config persistor_config( GetMarkForCompilationPassFlags()->tf_xla_persistent_cache_directory, GetMarkForCompilationPassFlags()->tf_xla_disable_strict_signature_checks, GetMarkForCompilationPassFlags()->tf_xla_persistent_cache_prefix); if (platform_info.xla_device_metadata()) { - auto persistor = std::make_unique( - std::move(persistor_config), - platform_info.xla_device_metadata()->jit_device_type()); - auto compiler_client = std::make_unique( + *xla_device_compiler = CreateXlaDeviceCompiler( + persistor_config, + platform_info.xla_device_metadata()->jit_device_type(), platform_info.xla_device_metadata()->client()); - *xla_device_compiler = - new XlaDeviceCompiler(std::move(persistor), std::move(compiler_client)); return OkStatus(); } // TFRT-TPU is used if device type is `DEVICE_TPU` and platform_info does not - // have `xla_device_metadata`. + // have `xla_device_metadata`. This is used for TFRT-TPU when + // BuildXlaDeviceCompiler() is called in GetCompilerIr(). Currently only + // lowering to HLO is needed there and xla::LocalClient doesn't support + // building the executable for TFRT-TPU and hence, is set to nullptr here. 
if (platform_info.device_type() == DEVICE_TPU) { - auto persistor = std::make_unique( - std::move(persistor_config), DeviceType(DEVICE_TPU_XLA_JIT)); - auto compiler_client = std::make_unique(nullptr); - *xla_device_compiler = - new XlaDeviceCompiler(std::move(persistor), std::move(compiler_client)); + *xla_device_compiler = CreateXlaDeviceCompiler( + persistor_config, DeviceType(DEVICE_TPU_XLA_JIT), nullptr); return OkStatus(); } @@ -118,13 +155,8 @@ Status BuildXlaDeviceCompiler(DeviceBase* device, FunctionLibraryRuntime* flr, client_options.set_intra_op_parallelism_threads( device->tensorflow_cpu_worker_threads()->num_threads); - if (flr->config_proto()) { - string allowed_gpus = - flr->config_proto()->gpu_options().visible_device_list(); - TF_ASSIGN_OR_RETURN(std::optional> gpu_ids, - ParseVisibleDeviceList(allowed_gpus)); - client_options.set_allowed_devices(gpu_ids); - } + TF_ASSIGN_OR_RETURN(auto allowed_gpus, GetAllowedGpus(flr)); + client_options.set_allowed_devices(allowed_gpus); auto client = xla::ClientLibrary::GetOrCreateLocalClient(client_options); if (!client.ok()) { @@ -137,13 +169,77 @@ Status BuildXlaDeviceCompiler(DeviceBase* device, FunctionLibraryRuntime* flr, platform_info.device_type().type()); } - auto persistor = std::make_unique( - std::move(persistor_config), - DeviceType(registration->compilation_device_name)); - auto compiler_client = - std::make_unique(client.value()); - *xla_device_compiler = - new XlaDeviceCompiler(std::move(persistor), std::move(compiler_client)); + *xla_device_compiler = CreateXlaDeviceCompiler( + persistor_config, DeviceType(registration->compilation_device_name), + client.value()); + return OkStatus(); +} + +Status BuildPjRtDeviceCompiler(const XlaPlatformInfo& platform_info, + FunctionLibraryRuntime* flr, + PjRtDeviceCompiler** pjrt_device_compiler) { + PjRtDeviceExecutablePersistor::Config persistor_config( + GetMarkForCompilationPassFlags()->tf_xla_persistent_cache_directory, + GetMarkForCompilationPassFlags()->tf_xla_disable_strict_signature_checks, + GetMarkForCompilationPassFlags()->tf_xla_persistent_cache_prefix); + + DeviceType device_type = platform_info.device_type(); + + if (platform_info.xla_device_metadata()) { + VLOG(2) << "Building PjRtDeviceCompiler using " + "platform_info.xla_device_metadata()."; + + DeviceType compilation_device_type = + platform_info.xla_device_metadata()->jit_device_type(); + TF_ASSIGN_OR_RETURN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); + + *pjrt_device_compiler = CreatePjRtDeviceCompiler( + persistor_config, compilation_device_type, pjrt_client); + return OkStatus(); + } + if (platform_info.pjrt_device_metadata()) { + VLOG(2) << "Building PjRtDeviceCompiler using " + "platform_info.pjrt_device_metadata()."; + + DeviceType compilation_device_type = + platform_info.pjrt_device_metadata()->jit_device_type(); + TF_ASSIGN_OR_RETURN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); + + *pjrt_device_compiler = CreatePjRtDeviceCompiler( + persistor_config, compilation_device_type, pjrt_client); + return OkStatus(); + } + + // TFRT-TPU is used if device_type is `DEVICE_TPU` and platform_info does not + // have `xla_device_metadata`. 
+ if (device_type == DEVICE_TPU) { + TF_ASSIGN_OR_RETURN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); + *pjrt_device_compiler = CreatePjRtDeviceCompiler( + persistor_config, DeviceType(DEVICE_TPU_XLA_JIT), pjrt_client); + return OkStatus(); + } + + VLOG(2) << "platform_info.xla_device_metadata not found and " + "platform_info.device_type() != DEVICE_TPU. Building " + "PjRtDeviceCompiler for non-XLA device."; + + const XlaOpRegistry::DeviceRegistration* registration; + if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration)) { + return errors::InvalidArgument("No JIT device registered for ", + device_type.type()); + } + auto compilation_device_type = + DeviceType(registration->compilation_device_name); + + TF_ASSIGN_OR_RETURN(auto allowed_gpus, GetAllowedGpus(flr)); + // TODO(b/255826209): Set platform, intra op parallelism threads if required + // and when supported by GetOrCreatePjRtClient(). + // The `allowed_gpus` argument is used only if the `device_type` is GPU. + TF_ASSIGN_OR_RETURN(auto pjrt_client, + GetOrCreatePjRtClient(device_type, allowed_gpus)); + + *pjrt_device_compiler = CreatePjRtDeviceCompiler( + persistor_config, compilation_device_type, pjrt_client); return OkStatus(); } @@ -151,6 +247,7 @@ XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device_base) { auto device = static_cast(device_base); se::Platform::Id platform_id = nullptr; const XlaDevice::Metadata* xla_device_metadata = nullptr; + const PjRtBaseDevice::Metadata* pjrt_device_metadata = nullptr; std::shared_ptr custom_allocator; if (device->device_type() == DEVICE_CPU) { @@ -174,10 +271,14 @@ XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device_base) { platform_id = xla_device_metadata->platform()->id(); custom_allocator = xla_device_metadata->client()->backend().shared_memory_allocator(); + } else if (auto metadata = PjRtBaseDevice::GetMetadataFromDevice(device); + metadata.ok()) { + pjrt_device_metadata = *metadata; } return XlaPlatformInfo(DeviceType(device->device_type()), platform_id, - xla_device_metadata, custom_allocator); + xla_device_metadata, pjrt_device_metadata, + custom_allocator); } std::shared_ptr GetAllocator( diff --git a/tensorflow/compiler/jit/xla_platform_info.h b/tensorflow/compiler/jit/xla_platform_info.h index 97ed0b4a9db..725a876904d 100644 --- a/tensorflow/compiler/jit/xla_platform_info.h +++ b/tensorflow/compiler/jit/xla_platform_info.h @@ -20,6 +20,7 @@ limitations under the License. #include #include "tensorflow/compiler/jit/device_compiler.h" +#include "tensorflow/compiler/jit/pjrt_base_device.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/xla/stream_executor/tf_allocator_adapter.h" @@ -27,7 +28,8 @@ namespace tensorflow { // Holds some information about the platform on which an // XlaLaunch/_XlaCompile/_XlaRun op must run on. Provides a common layer of -// abstraction for normal and XLA devices. +// abstraction for normal, XLA devices and devices inheriting from +// PjRtBaseDevice. 
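Editor's note: the intended call pattern for the new BuildPjRtDeviceCompiler() mirrors the existing BuildXlaDeviceCompiler() one and is exercised by the unit tests added later in this change. A hedged sketch of the kernel-side wiring, under the assumption that the op runs on a device for which a PjRtClient can be obtained; the wrapper function below is illustrative and not part of this diff.

```c++
#include "tensorflow/compiler/jit/device_compiler.h"
#include "tensorflow/compiler/jit/xla_platform_info.h"
#include "tensorflow/core/platform/refcount.h"

namespace tensorflow {

// Illustrative: build a PjRtDeviceCompiler for the device the calling kernel
// is placed on.
Status GetPjRtDeviceCompilerForOp(OpKernelContext* ctx,
                                  FunctionLibraryRuntime* flr) {
  XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(ctx->device());

  DeviceCompiler<xla::PjRtLoadedExecutable, xla::PjRtClient>*
      pjrt_device_compiler = nullptr;
  TF_RETURN_IF_ERROR(
      BuildPjRtDeviceCompiler(platform_info, flr, &pjrt_device_compiler));
  // The device compiler is ref-counted; release the reference when done.
  core::ScopedUnref compiler_ref(pjrt_device_compiler);

  // ... compile with pjrt_device_compiler (e.g. CompileSingleOpIfNeeded) and
  // run the resulting executable via RunPjRtExecutable() ...
  return OkStatus();
}

}  // namespace tensorflow
```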
class XlaPlatformInfo { public: XlaPlatformInfo() : device_type_("") {} @@ -35,10 +37,12 @@ class XlaPlatformInfo { explicit XlaPlatformInfo( const DeviceType device_type, se::Platform::Id platform_id, const XlaDevice::Metadata* xla_device_metadata, + const PjRtBaseDevice::Metadata* pjrt_device_metadata, std::shared_ptr device_allocator) : device_type_(device_type), platform_id_(platform_id), xla_device_metadata_(xla_device_metadata), + pjrt_device_metadata_(pjrt_device_metadata), device_allocator_(device_allocator) {} XlaPlatformInfo& operator=(XlaPlatformInfo&& other) = default; @@ -65,6 +69,10 @@ class XlaPlatformInfo { } bool is_on_xla_device() const { return xla_device_metadata() != nullptr; } + const PjRtBaseDevice::Metadata* pjrt_device_metadata() const { + return pjrt_device_metadata_; + } + private: DeviceType device_type_; se::Platform::Id platform_id_; @@ -74,6 +82,11 @@ class XlaPlatformInfo { // XlaLaunch/_XlaCompile/_XlaRun OpKernel. const XlaDevice::Metadata* xla_device_metadata_; + // pjrt_device_metadata_ lives in tensorflow::PjRtBaseDevice in which the + // XlaLaunch/XlaCompileOnDemand op is placed and thus does not die before the + // op kernel. + const PjRtBaseDevice::Metadata* pjrt_device_metadata_; + // If the op associated with this XlaPlatformInfo is placed on an XLA device // then device_allocator_ is the xla::Backend's memory allocator. If the op // is placed on a regular CPU or GPU device then device_allocator_ is null. @@ -90,13 +103,28 @@ class XlaPlatformInfo { StatusOr>> ParseVisibleDeviceList( absl::string_view visible_device_list); -// Returns created XLA compilation cache. +// Builds a DeviceCompiler that uses xla::LocalClient using `platform_info` and +// sets *xla_device_compiler to point to it. Uses flags from +// `MarkForCompilationPassFlags` for configuring the persistor used in the +// DeviceCompiler. Status BuildXlaDeviceCompiler( DeviceBase* dev, FunctionLibraryRuntime* flr, const XlaPlatformInfo& platform_info, DeviceCompiler** xla_device_compiler); +// Builds a DeviceCompiler that uses xla::PjRtClient using an appropriate +// PjRtClient for `platform_info.device_type()` and sets *pjrt_device_compiler +// to point to it. Uses flags from `MarkForCompilationPassFlags` for configuring +// the persistor used in the DeviceCompiler. Please note that non-XLA devices +// aren't supported yet. This is because: +// 1. PjRtClient doesn't support data transfer for non-XLA devices yet +// 2. Fetching the PjRtClient for non-XLA devices is also not supported yet +Status BuildPjRtDeviceCompiler( + const XlaPlatformInfo& platform_info, FunctionLibraryRuntime* flr, + DeviceCompiler** + pjrt_device_compiler); + // Returns information about the platform from kernel context. XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device); diff --git a/tensorflow/compiler/jit/xla_platform_info_test.cc b/tensorflow/compiler/jit/xla_platform_info_test.cc new file mode 100644 index 00000000000..0dedbb39bb9 --- /dev/null +++ b/tensorflow/compiler/jit/xla_platform_info_test.cc @@ -0,0 +1,170 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_platform_info.h" + +#include +#include + +#include +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/test_util.h" +#include "tensorflow/compiler/xla/pjrt/tfrt_cpu_pjrt_client.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/platform/status_matchers.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/core/tfrt/common/create_pjrt_client_util.h" +#include "tensorflow/core/tfrt/common/pjrt_util.h" +#include "tensorflow/core/tpu/tpu_defs.h" + +namespace tensorflow { +namespace { +using XlaDeviceCompiler = + DeviceCompiler; +using PjRtDeviceCompiler = + DeviceCompiler; + +class XlaPlatformInfoTest : public ::testing::Test { + protected: + void SetUp() override { + tensorflow::GetXlaDeviceFlags()->tf_xla_enable_xla_devices = true; + } + + DeviceSetup device_setup_; +}; + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +TEST_F(XlaPlatformInfoTest, BuildXlaDeviceCompilerXlaDeviceMetadata) { + device_setup_.AddDevicesAndSetUp({DEVICE_XLA_GPU}); + + Device* device = device_setup_.GetDevice(DEVICE_XLA_GPU); + const XlaDevice::Metadata* metadata = nullptr; + TF_CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata)); + XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device); + + XlaDeviceCompiler* xla_device_compiler = nullptr; + TF_EXPECT_OK(BuildXlaDeviceCompiler(device, device_setup_.flr(), + platform_info, &xla_device_compiler)); + core::ScopedUnref xla_device_compiler_ref(xla_device_compiler); + + EXPECT_EQ(xla_device_compiler->device_type(), metadata->jit_device_type()); + EXPECT_EQ(xla_device_compiler->client(), metadata->client()); +} + +TEST_F(XlaPlatformInfoTest, BuildXlaDeviceCompilerNonXlaDevice) { + device_setup_.AddDevicesAndSetUp({DEVICE_GPU}); + Device* device = device_setup_.GetDevice(DEVICE_GPU); + + XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device); + XlaDeviceCompiler* xla_device_compiler = nullptr; + TF_EXPECT_OK(BuildXlaDeviceCompiler(device, device_setup_.flr(), + platform_info, &xla_device_compiler)); + core::ScopedUnref xla_device_compiler_ref(xla_device_compiler); + + EXPECT_EQ(xla_device_compiler->device_type(), DeviceType(DEVICE_GPU_XLA_JIT)); + EXPECT_TRUE(xla_device_compiler->client() != nullptr); +} + +TEST_F(XlaPlatformInfoTest, BuildPjRtDeviceCompilerTestXlaDevice) { + DeviceType device_type = DeviceType(DEVICE_XLA_GPU); + device_setup_.AddDevicesAndSetUp({device_type.type()}); + + Device* device = device_setup_.GetDevice(device_type.type()); + const XlaDevice::Metadata* metadata = nullptr; + TF_CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata)); + XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device); + + PjRtDeviceCompiler* pjrt_device_compiler = nullptr; + TF_EXPECT_OK(BuildPjRtDeviceCompiler(platform_info, device_setup_.flr(), + 
&pjrt_device_compiler)); + core::ScopedUnref pjrt_device_compiler_ref(pjrt_device_compiler); + + TF_ASSERT_OK_AND_ASSIGN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); + EXPECT_EQ(pjrt_device_compiler->device_type(), metadata->jit_device_type()); + EXPECT_EQ(pjrt_device_compiler->client(), pjrt_client); +} + +TEST_F(XlaPlatformInfoTest, BuildPjRtDeviceCompilerTestGpuDevice) { + device_setup_.AddDevicesAndSetUp({DEVICE_GPU}); + Device* device = device_setup_.GetDevice(DEVICE_GPU); + XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device); + PjRtDeviceCompiler* pjrt_device_compiler = nullptr; + TF_EXPECT_OK(BuildPjRtDeviceCompiler(platform_info, device_setup_.flr(), + &pjrt_device_compiler)); + core::ScopedUnref pjrt_device_compiler_ref(pjrt_device_compiler); +} +#endif + +TEST_F(XlaPlatformInfoTest, BuildXlaDeviceCompilerTpuDevice) { + DeviceType compilation_device_type = DeviceType(DEVICE_TPU_XLA_JIT); + + // Instead of creating/initializing a TPU device, create a dummy platform_info + // and use a nullptr for Device for testing purposes. Only + // XlaPlatformInfo::device_type() is needed to build the appropriate + // XlaDeviceCompiler. + Device* device = nullptr; + XlaPlatformInfo platform_info(DeviceType(DEVICE_TPU), /*platform_id=*/nullptr, + /*xla_device_metadata=*/nullptr, + /*pjrt_device_metadata=*/nullptr, + /*device_allocator=*/nullptr); + + XlaDeviceCompiler* xla_device_compiler = nullptr; + TF_EXPECT_OK(BuildXlaDeviceCompiler(device, nullptr, platform_info, + &xla_device_compiler)); + core::ScopedUnref xla_device_compiler_ref(xla_device_compiler); + + EXPECT_EQ(xla_device_compiler->device_type(), compilation_device_type); + // TFRT-TPU is used if device type is `DEVICE_TPU` and `platform_info` does + // not have `xla_device_metadata`. XlaDeviceCompiler/xla::LocalClient is not + // used in this case. + EXPECT_EQ(xla_device_compiler->client(), nullptr); +} + +// TODO(b/255826209): Look into using an actual TPU device for the unit test, +// and move this out of OSS. +TEST_F(XlaPlatformInfoTest, BuildPjRtDeviceCompilerTpuDevice) { + DeviceType device_type = DeviceType(DEVICE_TPU); + DeviceType compilation_device_type = DeviceType(DEVICE_TPU_XLA_JIT); + // Use a CPU PjRtClient instead of a TPU one just for testing whether + // GetOrCreatePjRtClient() is being called with the correct arguments. + TF_CHECK_OK(SetPjRtClientInTFGlobalResourceManager( + device_type, + xla::GetTfrtCpuClient(/*asynchronous=*/true, /*cpu_device_count=*/1) + .value())); + TF_ASSERT_OK_AND_ASSIGN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); + + // Instead of creating/initializing a TPU device, create a dummy platform_info + // for testing purposes. Only XlaPlatformInfo::device_type() is needed to + // build the appropriate PjRtDeviceCompiler. 
+ XlaPlatformInfo platform_info(device_type, /*platform_id=*/nullptr, + /*xla_device_metadata=*/nullptr, + /*pjrt_device_metadata=*/nullptr, + /*device_allocator=*/nullptr); + + PjRtDeviceCompiler* pjrt_device_compiler = nullptr; + TF_EXPECT_OK( + BuildPjRtDeviceCompiler(platform_info, nullptr, &pjrt_device_compiler)); + core::ScopedUnref pjrt_device_compiler_ref(pjrt_device_compiler); + + EXPECT_EQ(pjrt_device_compiler->device_type(), compilation_device_type); + EXPECT_EQ(pjrt_device_compiler->client(), pjrt_client); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_tpu_device.cc b/tensorflow/compiler/jit/xla_tpu_device.cc index e6047e68bde..1f4db51e417 100644 --- a/tensorflow/compiler/jit/xla_tpu_device.cc +++ b/tensorflow/compiler/jit/xla_tpu_device.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/types/optional.h" #include "tensorflow/compiler/jit/kernels/xla_ops.h" #include "tensorflow/compiler/jit/xla_device.h" +#include "tensorflow/compiler/jit/xla_device_context.h" #include "tensorflow/compiler/jit/xla_device_ops.h" #include "tensorflow/compiler/tf2xla/layout_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 923b203ab38..b359e05aac7 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -60,8 +60,8 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tf_legalize_hlo", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", # buildcleaner:keep "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", - "//tensorflow/compiler/mlir/tf2xla:tf_xla_passes", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:tf_xla_passes", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/mlir/tosa:tf_passes", "//tensorflow/compiler/mlir/tosa:tf_tfl_passes", "//tensorflow/compiler/mlir/tosa:tfl_passes", @@ -249,6 +249,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/framework:tensor_testutil", + "//tensorflow/core/lib/monitoring:cell_reader", "@llvm-project//mlir:IR", ], ) diff --git a/tensorflow/compiler/mlir/g3doc/_includes/tf_passes.md b/tensorflow/compiler/mlir/g3doc/_includes/tf_passes.md index 17996380f68..9405aa417df 100644 --- a/tensorflow/compiler/mlir/g3doc/_includes/tf_passes.md +++ b/tensorflow/compiler/mlir/g3doc/_includes/tf_passes.md @@ -178,6 +178,11 @@ func @_func(%arg0: tensor) -> tensor { return %identity : tensor } ``` + +#### Options +``` +-globally-unique-func-names : If true, the pass adds extra identifiers to make function names globally unique within a process, not just within a module. +``` ### `-tf-device-constant-sinking`: Sinks constants implicitly captured in a tf_device.cluster region. This pass sinks implicitly captured constants (`tf.Const` ops) used by and into a `tf_device.cluster` region. Performing this prior to outlining will reduce the @@ -244,6 +249,11 @@ func @_func(%arg0: tensor) -> tensor { return %identity : tensor } ``` + +#### Options +``` +-globally-unique-func-names : If true, the pass adds extra identifiers to make function names globally unique within a process, not just within a module. 
+``` ### `-tf-device-mark-input-output-aliases`: Marks device cluster inputs-output pairs that read/write to the same variable as aliases This pass analyzes the inputs and outputs to device cluster and marks those input-output pairs as aliases (using `tf.aliasing_output` attribute) which read @@ -259,6 +269,9 @@ inside device cluster. This would allow shape inference pass to further refine operand/result shapes of these ops. This is only safe to do when compiling to XLA. ### `-tf-einsum`: Transform Einsum to other TF Ops for the supported variants +### `-tf-embedding-pipelining`: Rewrite graph for embedding pipelining +For architectures that support accelerated embedding lookups, this pass will +rewrite the graph to use pipelining for better device utilization. ### `-tf-executor-break-up-islands`: Transform from TF control dialect to TF executor dialect. ### `-tf-executor-check-control-dependencies`: Checks control dependencies This pass analyzes control dependencies between islands and warns about @@ -726,6 +739,11 @@ func @outside_compilation() -> tensor { return %0 : tensor } ``` +### `-tf-extract-tpu-copy-with-dynamic-shape-op`: Extract the TPUCopyWithDynamicShapeOp out of the host launch and place it on device launch +This pass looks for TPUCopyWithDynamicShapeOp which wraps in a +`tf_device.launch` with host device attribute. It extracts the ops and wrap +them in `tf_device.launch` with tpu device attribute so that ops can be +run on TPU instead of CPU while still being compiled on host. ### `-tf-functional-control-flow-to-cfg`: Transform functional control flow Ops to MLIR Control Form Graph (CFG) form ### `-tf-functional-control-flow-to-regions`: Transforms functional control flow operations to their region-based counterparts This pass transforms functional control flow operations in the TensorFlow @@ -1007,7 +1025,7 @@ Would become the following ops (unimportant attribute, type are omitted): "tf_device.launch"() { "tf.B"() {_xla_outside_compilation = "cluster1"} tf_device.return - } {device = "TPU_REPLICATED_HOST"} : () -> () + } {device = "TPU_REPLICATED_HOST_0"} : () -> () "tf.C"() tf_device.return }) {num_cores_per_replica = 1, topology = "", device_assignment = []} @@ -1161,6 +1179,12 @@ region and hoists them out. It also makes `tf.Shape` ops replicate invariant if possible. This currently updates or replaces `tf.Shape` ops of replicated arguments, either tensors or resources. +The primary benefit of the pass is to hoist `num_replicas` `_TPUCompile`s +into a single `_TPUCompile`. + +This pass assumes that when a `tf.Shape` directly inputs from `replicate` +params, then it is the same shape across replicas. + For example, the following ```mlir @@ -1409,6 +1433,10 @@ func @main(%arg0: tensor<8x4xf32>) { return } ``` +### `-tf-tpu-annotate-dynamic-shape-inputs`: Annotate the inputs returned by TPUCopyWithDynamicShapeOp with dynamic shape +This pass looks for the usage of the result of TPUCopyWithDynamicShapeOp +and sets the shape of these inputs to be dynamic shaped. This will ensure +that the generated HLO program is correctly reflecting the dynamic shape. ### `-tf-tpu-cleanup-cluster-attributes`: Eliminate _replication_info and other attributes from ops in a cluster This pass eliminate `_replication_info` and `device` attribute on operations that are contained in a tf_device.cluster op. 
@@ -1508,6 +1536,25 @@ Then said `ReadVariableOp` is going to get replaced by: tf_device.return %2 : tensor<4xf32> }) {...} : () -> tensor<4xf32> ``` +### `-tf-tpu-colocate-splits`: Colocates each Split op with its predecessor +It is beneficial for performance to assign a `Split` op to the same device +as its predecessor. This is because the weight of cut edges is always +minimized when the `Split` is with its predecessor. This colocation +constraint will be used by the placer graph optimization to assign a device +to the op. + +This pass should run in the export pipeline after tf-replicate-to-island so +each replica has its own distinct (predecessor, Split) pair. + +The colocation class (`_class`) of the `Split` is set to the same class as +its predecessor: + +```mlir +%outputs1:2, %control1 = tf_executor.island wraps "tf.IteratorGetNext"(%arg) + {_class = ["loc:@dataset_iterator_1"]} +%outputs2:2, %control2 = tf_executor.island wraps "tf.Split"(%outputs0, %outputs1#1) + {_class = ["loc:@dataset_iterator_1", num_split = 2 : i32} +``` ### `-tf-tpu-device-propagation`: Propagates TPU devices from ops to users ### `-tf-tpu-dynamic-layout-pass`: Inserts TPU layout ops to determine layout at run time. A pass that allows TPU input layout to be determined after JIT compilation. @@ -1740,7 +1787,7 @@ will be rewritten as: ```mlir func @tf_tpu_rewrite(%arg0: tensor, %arg1: tensor) { - %0:2 = tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]}, n = 2 : i32} { + %0:2 = tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST_0 = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]}, n = 2 : i32} { %1:2 = "tf_device.launch"() ( { %compilation_status, %program = "tf._TPUCompileMlir"() {mlir_module = ""} : () -> (tensor, tensor<3x!tf_type.string>) tf_device.return %compilation_status, %program : tensor, tensor<3x!tf_type.string> diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index f749b3e1221..27c2706622a 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -309,6 +309,7 @@ cc_library( "ir/tfl_ops.h", "transforms/passes.h", "utils/attribute_utils.h", + "utils/utils.h", "@llvm-project//mlir:include/mlir/Transforms/InliningUtils.h", ], deps = [ @@ -665,8 +666,8 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tf_legalize_hlo", "//tensorflow/compiler/mlir/tensorflow:unroll_batch_matmul_pass", "//tensorflow/compiler/mlir/tensorflow:verification_utils", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf_with_tf2xla", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/mlir_hlo", @@ -966,29 +967,35 @@ cc_library( ":tensorflow_lite", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite/metrics:error_collector_inst", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/tensorflow", 
"//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:framework", + "//tensorflow/core:portable_gif_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/platform:errors", - "//tensorflow/core/platform:logging", - "//tensorflow/core/platform:status", + "//tensorflow/lite:graph_info", "//tensorflow/lite:schema_fbs_version", "//tensorflow/lite:string_util", + "//tensorflow/lite/core:framework", + "//tensorflow/lite/core/c:private_common", "//tensorflow/lite/delegates/flex:allowlisted_flex_ops_lib", "//tensorflow/lite/experimental/remat:metadata_util", - "//tensorflow/lite/kernels/internal:kernel_utils", + "//tensorflow/lite/python/metrics:converter_error_data_proto_cc", "//tensorflow/lite/schema:schema_conversion_utils", "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/tools/versioning", "//tensorflow/lite/tools/versioning:gpu_compatibility", + "//tensorflow/tsl/platform:status", + "//tensorflow/tsl/platform:tstring", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@flatbuffers", @@ -998,7 +1005,6 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:Support", - "@llvm-project//mlir:TranslateLib", ], ) @@ -1268,6 +1274,7 @@ cc_library( "//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_tfl", "//tensorflow/compiler/mlir/lite/stablehlo:transforms", "//tensorflow/compiler/mlir/lite/stablehlo/serializer:flatbuffer_export", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantize_passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess", @@ -1282,6 +1289,8 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:status", + "//tensorflow/lite/python/metrics:converter_error_data_proto_cc", "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/tools/optimize:quantize_weights", "//tensorflow/lite/tools/optimize:reduced_precision_support", diff --git a/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc b/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc index bfae3e96202..5f77797b9aa 100644 --- a/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc +++ b/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/cluster_util.h" namespace mlir { @@ -44,7 +45,7 @@ namespace common { bool IsConstantOrNone(Operation* op) { return (op->getNumResults() == 1 && op->getResult(0).getType().isa()) || - matchPattern(op, m_Constant()); + matchPattern(op, m_Constant()) || isa(op); } // Pre-order traverse, adding results and BlockArgs to `been_defined` and diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h index 99928fcf4d8..38286ed3cfe 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h @@ -94,6 +94,8 @@ class TargetHardware { // Usually should be something like mlir::TypeID::get() virtual mlir::TypeID GetTypeId() const = 0; + virtual void GetDependentDialects(mlir::DialectRegistry& registry) const {} + protected: // All registered hardware ops. std::vector> hardware_ops_; diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD index 84e3df38b08..d363334fb5f 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD @@ -1,5 +1,9 @@ load("//tensorflow:tensorflow.default.bzl", "pybind_extension") load("//tensorflow:tensorflow.bzl", "VERSION") +load( + "//third_party/mkl_dnn:build_defs.bzl", + "if_onednn_v3", +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -96,7 +100,7 @@ pybind_extension( "@upb//:__subpackages__", "@XNNPACK//:__subpackages__", "@zlib//:__subpackages__", - ], + ] + if_onednn_v3(["@onednn_v3//:__subpackages__"]), deps = [ ":tac_wrapper_lib", "//tensorflow/python:pybind11_lib", diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir index 9e14f1eae7e..18b9e0fd605 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir @@ -1,4 +1,6 @@ // RUN: tac-opt-all-backends -tfl-raise-target-subgraphs %s -split-input-file | FileCheck %s +// RUN: tac-opt-all-backends -tfl-raise-target-subgraphs="skip-raise-cpu-ops=true" %s -split-input-file | FileCheck %s --check-prefixes=CHECK-SKIP-CPU +// RUN: tac-opt-all-backends -tfl-raise-target-subgraphs="ignore-inference-type=true" %s -split-input-file | FileCheck %s --check-prefixes=CHECK-IGNORE-INFERENCE-TYPE module { func.func @simpleWhile(%arg0: tensor) -> tensor { @@ -502,3 +504,69 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor // CHECK: return %1, %5, %7, %11, %13, %15, %16, %18, %20, %21 : tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor // CHECK: } + +// ----- + +// CHECK-SKIP-CPU-LABEL: testSkipCpuOps +func.func @testSkipCpuOps(%arg0: tensor<1xf32>) -> (tensor<1xf32>, tensor<1xf32>) { + %0 = "tfl.add"(%arg0, %arg0) {tac.device = "GPU", fused_activation_function = "RELU6", tac.inference_type = "FLOAT"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %1 = "tfl.add"(%arg0, %0) 
{tac.device = "CPU", fused_activation_function = "RELU6", tac.inference_type = "FLOAT"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + func.return %0, %1 : tensor<1xf32>, tensor<1xf32> +} + +// CHECK-SKIP-CPU: %[[RES0:.*]] = call @func_0_GPU_FLOAT(%arg0) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} : (tensor<1xf32>) -> tensor<1xf32> +// CHECK-SKIP-CPU: %[[RES1:.*]] = tfl.add %arg0, %[[RES0]] {fused_activation_function = "RELU6", tac.device = "CPU", tac.inference_type = "FLOAT"} : tensor<1xf32> +// CHECK-SKIP-CPU: return %[[RES0]], %[[RES1]] : tensor<1xf32>, tensor<1xf32> +// CHECK-SKIP-CPU: } +// CHECK-SKIP-CPU: func.func private @func_0_GPU_FLOAT(%arg0: tensor<1xf32>) -> tensor<1xf32> attributes {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} { +// CHECK-SKIP-CPU: %[[RES2:.*]] = tfl.add %arg0, %arg0 {fused_activation_function = "RELU6", tac.device = "GPU", tac.inference_type = "FLOAT"} : tensor<1xf32> +// CHECK-SKIP-CPU: return %[[RES2]] : tensor<1xf32> +// CHECK-SKIP-CPU: } + +// ----- + +// CHECK-SKIP-CPU-LABEL: testSkipCpuOpsWithinLoop +func.func @testSkipCpuOpsWithinLoop(%arg0: tensor) -> tensor { + %0 = "tfl.while"(%arg0) ({ + ^bb0(%block: tensor): + "tfl.yield"(%block) : (tensor) -> () + },{ + ^bb0(%block: tensor): + %0 = "tfl.add"(%arg0, %block) {tac.device = "GPU", fused_activation_function = "RELU6", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor + "tfl.yield"(%0) : (tensor) -> () + }) {tac.device = "CPU", fused_activation_function = "RELU6", tac.inference_type = "FLOAT"} : (tensor) -> tensor + func.return %0 : tensor +} + +// CHECK-SKIP-CPU: "tfl.while" +// CHECK-SKIP-CPU: ^bb0(%[[ARG0:.*]]: tensor): +// CHECK-SKIP-CPU: "tfl.yield"(%[[ARG0]]) : (tensor) -> () +// CHECK-SKIP-CPU: }, { +// CHECK-SKIP-CPU: ^bb0(%[[ARG1:.*]]: tensor): +// CHECK-SKIP-CPU: %[[RES0:.*]] = func.call @func_0_GPU_FLOAT(%{{.*}}, %[[ARG1]]) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} : (tensor, tensor) -> tensor +// CHECK-SKIP-CPU: "tfl.yield"(%[[RES0]]) : (tensor) -> () +// CHECK-SKIP-CPU: }) {fused_activation_function = "RELU6", tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor) -> tensor + +// CHECK-SKIP-CPU: func.func private @func_0_GPU_FLOAT(%[[ARG2:.*]]: tensor, %[[ARG3:.*]]: tensor) -> tensor attributes {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} { +// CHECK-SKIP-CPU: %[[RES1:.*]] = tfl.add %[[ARG2]], %[[ARG3]] {fused_activation_function = "RELU6", tac.device = "GPU", tac.inference_type = "FLOAT"} : tensor +// CHECK-SKIP-CPU: return %[[RES1]] : tensor +// CHECK-SKIP-CPU: } + +// ----- + +// CHECK-IGNORE-INFERENCE-TYPE-LABEL: testIgnoreInferenceType +func.func @testIgnoreInferenceType(%arg0: tensor<1x384x384xf32>, %arg1: tensor<1x1x384x!quant.uniform>) -> (tensor<1x384x384xf32>, tensor<1x1x384x!quant.uniform>) { + // These 2 ops are clustered together when `ignore-inference-type` sets to true. 
+ %0 = "tfl.add"(%arg0, %arg0) {tac.device = "GPU", tac.inference_type = "FLOAT", fused_activation_function = "NONE"} : (tensor<1x384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32> + %1 = "tfl.mul"(%arg1, %arg1) {tac.device = "GPU", tac.inference_type = "QUANTIZED_INT8", fused_activation_function = "NONE"} : (tensor<1x1x384x!quant.uniform>, tensor<1x1x384x!quant.uniform>) -> tensor<1x1x384x!quant.uniform> + func.return %0, %1: tensor<1x384x384xf32>, tensor<1x1x384x!quant.uniform> +} + +// CHECK-IGNORE-INFERENCE-TYPE: %[[RES0:.*]]:2 = call @[[FUNC_NAME:.*]](%arg0, %arg1) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} : (tensor<1x384x384xf32>, tensor<1x1x384x!quant.uniform>) -> (tensor<1x384x384xf32>, tensor<1x1x384x!quant.uniform>) +// CHECK-IGNORE-INFERENCE-TYPE: return %[[RES0]]#0, %[[RES0]]#1 : tensor<1x384x384xf32>, tensor<1x1x384x!quant.uniform> +// CHECK-IGNORE-INFERENCE-TYPE: } +// CHECK-IGNORE-INFERENCE-TYPE: func.func private @[[FUNC_NAME]](%arg0: tensor<1x384x384xf32>, %arg1: tensor<1x1x384x!quant.uniform>) -> (tensor<1x384x384xf32>, tensor<1x1x384x!quant.uniform>) attributes {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} { +// CHECK-IGNORE-INFERENCE-TYPE: %[[RES1:.*]] = tfl.add %arg0, %arg0 {fused_activation_function = "NONE", tac.device = "GPU", tac.inference_type = "FLOAT"} : tensor<1x384x384xf32> +// CHECK-IGNORE-INFERENCE-TYPE: %[[RES2:.*]] = tfl.mul %arg1, %arg1 {fused_activation_function = "NONE", tac.device = "GPU", tac.inference_type = "QUANTIZED_INT8"} : tensor<1x1x384x!quant.uniform> +// CHECK-IGNORE-INFERENCE-TYPE: return %[[RES1]], %[[RES2]] : tensor<1x384x384xf32>, tensor<1x1x384x!quant.uniform> +// CHECK-IGNORE-INFERENCE-TYPE: } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.cc index 15fb7e66477..4efdd053eec 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.cc @@ -65,7 +65,9 @@ int64_t GetTransferredTensorBytes(func::CallOp from_graph, if (IsQUI8Type(input_type) || IsQI8Type(input_type)) { total_size_transferred += input_type.getNumElements() * 8; } else { - total_size_transferred += input_type.cast().getSizeInBits(); + auto s_type = input_type.cast(); + total_size_transferred += + s_type.getNumElements() * s_type.getElementTypeBitWidth(); } } } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h index 201ce1690d3..f738b2e7a60 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h @@ -37,8 +37,12 @@ std::unique_ptr> CreateTargetAnnotationPass( std::unique_ptr> CreateTargetAnnotationPass( const TacModule* module); -// Create an instance of the RaiseTargetSubgraphsPass. -std::unique_ptr> CreateRaiseTargetSubgraphsPass(); +// Create an instance of the RaiseTargetSubgraphsPass. If `skip_raise_cpu_ops`, +// we skip clustering for CPU ops for better clustering of ops running on other +// ML accelerators. When `ignore_inference_type` is set to true, the inference +// types are set to "NOT_CARE" for better clustering. 
+std::unique_ptr> CreateRaiseTargetSubgraphsPass( + bool skip_raise_cpu_ops = false, bool ignore_inference_type = false); // Create an instance of the AlternativeSubgraphPass. std::unique_ptr> CreateAlternativeSubgraphPass( diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/raise_target_subgraphs.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/raise_target_subgraphs.cc index 92ac79aef63..4fd9f945764 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/raise_target_subgraphs.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/raise_target_subgraphs.cc @@ -41,6 +41,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/experimental/common/outline_operations.h" #include "tensorflow/compiler/mlir/lite/experimental/tac/common/subgraph.h" #include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" @@ -65,7 +66,28 @@ class RaiseTargetSubgraphsPass public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RaiseTargetSubgraphsPass) + RaiseTargetSubgraphsPass() = default; + RaiseTargetSubgraphsPass(const RaiseTargetSubgraphsPass& other) { + this->skip_raise_cpu_ops_ = other.skip_raise_cpu_ops_; + this->ignore_inference_type_ = other.ignore_inference_type_; + } + explicit RaiseTargetSubgraphsPass(bool skip_raise_cpu_ops, + bool ignore_inference_type) { + skip_raise_cpu_ops_ = skip_raise_cpu_ops; + ignore_inference_type_ = ignore_inference_type; + } + private: + Option skip_raise_cpu_ops_{ + *this, "skip-raise-cpu-ops", + llvm::cl::desc("Whether to cluster and raise CPU ops."), + llvm::cl::init(false)}; + + Option ignore_inference_type_{ + *this, "ignore-inference-type", + llvm::cl::desc("Whether to ignore the inference type in clustering."), + llvm::cl::init(false)}; + llvm::StringRef getArgument() const final { return "tfl-raise-target-subgraphs"; } @@ -189,8 +211,11 @@ void RaiseTargetSubgraphsPass::RaiseTargetSubgraphsForBlock( return std::string(""); } std::string concat_inference_device_type_string = - absl::StrCat(device_type.value().hardware, "_", - GetInferenceString(device_type.value().inference_type)); + ignore_inference_type_ + ? device_type.value().hardware + : absl::StrCat( + device_type.value().hardware, "_", + GetInferenceString(device_type.value().inference_type)); return concat_inference_device_type_string; }; @@ -208,6 +233,20 @@ void RaiseTargetSubgraphsPass::RaiseTargetSubgraphsForBlock( extract(cluster.ops); } } + if (skip_cpu) { + for (auto& op : block) { + auto op_device = GetInferenceDeviceTypeForOp(&op); + if (op_device_is(op, kCpuDeviceName)) + // The recently raised func is device type cpu & `op` is a "CPU". + // Recursivley call again to raise any non-"CPU" subgraphs contained + // within nested region of `op`. 
+ for (auto& region : op.getRegions()) + for (auto& block : region.getBlocks()) + RaiseTargetSubgraphsForBlock(block, builder, module, + /*skip_cpu=*/true, func_count, + side_effect_info); + } + } } void RaiseTargetSubgraphsPass::runOnOperation() { @@ -220,15 +259,18 @@ void RaiseTargetSubgraphsPass::runOnOperation() { for (auto& block : func) { OpBuilder builder = OpBuilder::atBlockBegin(&block); RaiseTargetSubgraphsForBlock(block, builder, module, - /*skip_cpu=*/false, func_count, info); + /*skip_cpu=*/skip_raise_cpu_ops_, func_count, + info); } } } } // namespace -std::unique_ptr> CreateRaiseTargetSubgraphsPass() { - return std::make_unique(); +std::unique_ptr> CreateRaiseTargetSubgraphsPass( + bool skip_raise_cpu_ops, bool ignore_inference_type) { + return std::make_unique(skip_raise_cpu_ops, + ignore_inference_type); } static PassRegistration pass; diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h index 78e47dcdee9..392a2713e95 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h @@ -41,7 +41,8 @@ class TacPass : public OperationPass { ~TacPass() override {} - const TargetHardware* GetTargetHardware(const std::string& hardware_name) { + const TargetHardware* GetTargetHardware( + const std::string& hardware_name) const { return module_ != nullptr ? module_->GetTargetHardware(hardware_name) : mlir::TFL::tac::GetTargetHardware(hardware_name); diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc index 009f27d936e..2dddad4e9a8 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc @@ -69,6 +69,16 @@ class TargetAnnotationPass : public TacFunctionPass { llvm::cl::desc( "comma separated list of device specs, like CPU, GPU, Hexagon."), llvm::cl::ZeroOrMore}; + + void getDependentDialects(mlir::DialectRegistry& registry) const override { + if (!module_) { + for (const auto& device : device_specs_flag_) { + auto* hardware = this->GetTargetHardware(device); + if (hardware == nullptr) continue; + hardware->GetDependentDialects(registry); + } + } + } }; void SetAnnotation(Operation* op, std::string attribute, std::string annotation, diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index c7ca7accb66..1e4475bd4b3 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -19,54 +19,73 @@ limitations under the License. 
#include #include +#include #include +#include +#include +#include +#include #include #include +#include #include +#include #include #include +#include "absl/algorithm/container.h" #include "absl/base/attributes.h" #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" -#include "absl/strings/match.h" +#include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" -#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "flatbuffers/buffer.h" // from @flatbuffers +#include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers #include "flatbuffers/flexbuffers.h" // from @flatbuffers +#include "flatbuffers/vector.h" // from @flatbuffers #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Tools/mlir-translate/Translation.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/flatbuffer_operator.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/lite/utils/low_bit_utils.h" #include "tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" @@ -74,28 +93,33 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" -#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/node_def.pb.h" -#include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/tstring.h" +#include "tensorflow/lite/core/c/builtin_op_data.h" +#include "tensorflow/lite/core/interpreter.h" #include "tensorflow/lite/delegates/flex/allowlisted_flex_ops.h" #include "tensorflow/lite/experimental/remat/metadata_util.h" -#include "tensorflow/lite/kernels/internal/kernel_utils.h" +#include "tensorflow/lite/graph_info.h" +#include "tensorflow/lite/python/metrics/converter_error_data.pb.h" #include "tensorflow/lite/schema/schema_conversion_utils.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/string_util.h" +#include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/tools/versioning/gpu_compatibility.h" #include "tensorflow/lite/tools/versioning/op_version.h" #include "tensorflow/lite/tools/versioning/runtime_version.h" #include "tensorflow/lite/version.h" +#include "tensorflow/tsl/platform/status.h" +#include "tensorflow/tsl/platform/tstring.h" using llvm::dyn_cast; using llvm::formatv; using llvm::isa; using llvm::StringRef; -using llvm::Twine; using mlir::Dialect; using mlir::ElementsAttr; using mlir::MLIRContext; @@ -105,6 +129,7 @@ using mlir::Operation; using mlir::Region; using mlir::StringAttr; using mlir::TensorType; +using mlir::Twine; using mlir::Type; using mlir::UnknownLoc; using mlir::Value; @@ -124,7 +149,6 @@ using VectorBufferOffset = flatbuffers::Offset>; using CustomOptionsOffset = VectorBufferOffset; -namespace error = tensorflow::error; namespace tfl = mlir::TFL; ABSL_CONST_INIT const absl::string_view kFlexOpNamePrefix = "Flex"; @@ -142,7 +166,7 @@ static StatusOr GetTFLiteType(Type type, return tflite::TensorType_UINT8; } if (!is_signed) { - return Status(error::INVALID_ARGUMENT, + return Status(absl::StatusCode::kInvalidArgument, "'isSigned' can only be set for 8-bits integer type"); } @@ -164,14 +188,14 @@ static StatusOr GetTFLiteType(Type type, if (ftype.isF64()) { return tflite::TensorType_COMPLEX128; } - return Status(error::INVALID_ARGUMENT, "Unsupported type"); + return Status(absl::StatusCode::kInvalidArgument, "Unsupported type"); } else if (auto itype = type.dyn_cast()) { switch (itype.getWidth()) { case 1: return tflite::TensorType_BOOL; case 4: if (itype.isUnsigned()) { - return Status(error::INVALID_ARGUMENT, + return Status(absl::StatusCode::kInvalidArgument, "Unsupported 4bit unsigned int type"); } else { return tflite::TensorType_INT4; @@ -207,7 +231,7 @@ static StatusOr GetTFLiteType(Type type, } // TFLite export fills FLOAT32 for unknown data types. Returning an error // for now for safety and this could be revisited when required. 
- return Status(error::INVALID_ARGUMENT, "Unsupported type"); + return Status(absl::StatusCode::kInvalidArgument, "Unsupported type"); } static bool IsConst(Operation* op) { @@ -335,7 +359,7 @@ static bool HasValidTFLiteType(Value value, T& error_handler) { if (!status.ok()) { return error_handler.emitError( formatv("Failed to convert element type '{0}': {1}", - element_type, status.status().error_message())), + element_type, status.status().message())), false; } return true; @@ -1553,7 +1577,7 @@ std::optional> Translator::BuildSubGraph( } bool failed_once = false; - for (auto& item : llvm::enumerate(bb)) { + for (const auto& item : llvm::enumerate(bb)) { Operation& inst = item.value(); const int operation_index = item.index(); if (inst.hasTrait()) break; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 6421598e76a..487b3edd60a 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -647,7 +647,7 @@ static StatusOr BuildSparseConstOp( } std::vector dense_buffer( value_type.getElementType().getIntOrFloatBitWidth() / CHAR_BIT); - mlir::Attribute dummy_value = + mlir::TypedAttr dummy_value = mlir::DenseIntOrFPElementsAttr::getFromRawBuffer(value_type, dense_buffer); @@ -1376,7 +1376,7 @@ StatusOr ConvertSubgraph( } // Construct MLIR operators from TFLite operators - for (auto& it : llvm::enumerate(subgraph.operators)) { + for (const auto& it : llvm::enumerate(subgraph.operators)) { auto& op = it.value(); if (experimental_prune_unreachable_nodes_unconditionally && @@ -1612,8 +1612,7 @@ OwningOpRef tflite::FlatBufferToMlir( model_control_dependencies[subgraph_index]); if (!func_or_error.ok()) { return emitError(base_loc, "could not translate function ") - << subgraph->name << ": " - << func_or_error.status().error_message(), + << subgraph->name << ": " << func_or_error.status().message(), nullptr; } module.push_back(std::move(func_or_error).value()); diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 62f9e220665..f23dfd96e88 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -282,6 +283,11 @@ bool IsI32Type(Type element_type) { return element_type.isInteger(32) && !element_type.isUnsignedInteger(); } +// Return true when the given element_type is UI32. +bool IsUI32Type(Type element_type) { + return element_type.isInteger(32) && element_type.isUnsignedInteger(); +} + // Return true when the given element_type is I64. bool IsI64Type(Type element_type) { return element_type.isInteger(64) && !element_type.isUnsignedInteger(); @@ -335,7 +341,7 @@ bool VerifyAddOpShapeConstraints(AddOp op) { IsI32Type(element_type) || IsI64Type(element_type)) { return VerifyOperandsHaveSameShapesOrBroadcastableShape( /*op=*/op.getOperation(), /*indices=*/ArrayRef{0, 1}, - /*max_bcast_rank=*/4); + /*max_bcast_rank=*/6); } // Allows QI16 output when operands have the same shape. 
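The `add` verifier above now accepts broadcastable shapes up to rank 6, and the `mul` verifier below gains unsigned 32-bit and 16-bit integer element types. For reference, the shape rule both delegate to is numpy-style trailing-dimension broadcasting capped at a maximum rank; a rough standalone restatement for static shapes (the real check is `VerifyOperandsHaveSameShapesOrBroadcastableShape`):

```c++
#include <algorithm>
#include <cstdint>
#include <vector>

// Returns true when two static shapes are identical, or broadcast against
// each other with neither exceeding `max_bcast_rank` dimensions.
bool SameOrBroadcastableUpToRank(const std::vector<int64_t>& lhs,
                                 const std::vector<int64_t>& rhs,
                                 int max_bcast_rank) {
  if (lhs == rhs) return true;
  if (static_cast<int>(std::max(lhs.size(), rhs.size())) > max_bcast_rank)
    return false;
  // Compare dimensions from the innermost outward; a dimension of 1
  // broadcasts against anything.
  for (auto l = lhs.rbegin(), r = rhs.rbegin();
       l != lhs.rend() && r != rhs.rend(); ++l, ++r) {
    if (*l != *r && *l != 1 && *r != 1) return false;
  }
  return true;
}
```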
@@ -389,8 +395,9 @@ bool VerifyMulOpShapeConstraints(MulOp op) { // Allows I32, I64, QI16 and F32 outputs when the operands have valid shapes, // which are broadcastable shapes up to four dimension or have same shapes. - if (IsI32Type(element_type) || IsI64Type(element_type) || - IsQI16Type(element_type) || element_type.isa() || + if (IsI32Type(element_type) || IsUI32Type(element_type) || + IsI64Type(element_type) || IsQI16Type(element_type) || + IsI16Type(element_type) || element_type.isa() || element_type.isF32()) { return VerifyOperandsHaveSameShapesOrBroadcastableShape( /*op=*/op.getOperation(), /*indices=*/ArrayRef{0, 1}, @@ -961,9 +968,9 @@ mlir::LogicalResult CustomOp::verify() { LogicalResult CustomTfOp::inferReturnTypes( MLIRContext*, std::optional location, ValueRange operands, - DictionaryAttr attr, RegionRange ranges, + DictionaryAttr attr, OpaqueProperties, RegionRange ranges, SmallVectorImpl& inferredReturnTypes) { - CustomTfOpAdaptor op(operands, attr, ranges); + CustomTfOpAdaptor op(operands, attr, {}, ranges); if (op.getRegions().empty()) return success(); auto* real_op = &op.getBody().front().front(); @@ -1226,7 +1233,7 @@ static LogicalResult ComputeConvWindowedOutputSize( LogicalResult Conv2DOp::inferReturnTypes( MLIRContext*, std::optional location, ValueRange operands, - DictionaryAttr attr, RegionRange, + DictionaryAttr attr, OpaqueProperties, RegionRange, SmallVectorImpl& inferredReturnTypes) { Conv2DOpAdaptor op(operands, attr); @@ -1711,7 +1718,7 @@ struct ConvertShapeTo1D : public OpRewritePattern { return failure(); } // It is already a 1-D constant, no change. - auto old_shape = shape.getType().getShape(); + auto old_shape = shape.getShapedType().getShape(); if (old_shape.size() == 1) { return failure(); } @@ -1724,7 +1731,7 @@ struct ConvertShapeTo1D : public OpRewritePattern { } } auto new_shape = shape.reshape(tensorflow::GetTypeFromTFTensorShape( - {*old_shape.rbegin()}, shape.getType().getElementType())); + {*old_shape.rbegin()}, shape.getShapedType().getElementType())); rewriter.replaceOpWithNewOp( reshape.getShape().getDefiningOp(), new_shape); return success(); @@ -1907,7 +1914,7 @@ mlir::LogicalResult ReshapeOp::verify() { LogicalResult ReshapeOp::inferReturnTypes( MLIRContext* context, std::optional location, ValueRange operands, - DictionaryAttr attr, RegionRange, + DictionaryAttr attr, OpaqueProperties, RegionRange, SmallVectorImpl& inferredReturnTypes) { ReshapeOpAdaptor op(operands, attr); const Value input = op.getInput(); @@ -2222,7 +2229,7 @@ static void BuildTopKOp(OpBuilder* builder, OperationState& result, Value input, if (!val_type.hasRank()) return TFL::TopKV2Op::build( *builder, result, UnrankedTensorType::get(val_type.getElementType()), - UnrankedTensorType::get(builder->getIntegerType(32)), input, k); + UnrankedTensorType::get(k.getType()), input, k); // Resultant shape is value.shape[:-1] + [k] std::vector shape(val_type.getShape()); @@ -2230,8 +2237,7 @@ static void BuildTopKOp(OpBuilder* builder, OperationState& result, Value input, TFL::TopKV2Op::build( *builder, result, tensorflow::GetTypeFromTFTensorShape(shape, val_type.getElementType()), - tensorflow::GetTypeFromTFTensorShape(shape, builder->getIntegerType(32)), - input, k); + tensorflow::GetTypeFromTFTensorShape(shape, k.getType()), input, k); } //===----------------------------------------------------------------------===// @@ -2285,7 +2291,7 @@ void FakeQuantOp::getCanonicalizationPatterns(RewritePatternSet& results, LogicalResult UnpackOp::inferReturnTypes( MLIRContext* 
context, std::optional loc, ValueRange operands, - DictionaryAttr attributes, RegionRange regions, + DictionaryAttr attributes, OpaqueProperties, RegionRange regions, SmallVectorImpl& inferredReturnTypes) { UnpackOpAdaptor op(operands, attributes); // TODO(jpienaar): Refactor verify @@ -2646,7 +2652,7 @@ mlir::LogicalResult UnidirectionalSequenceLSTMOp::verify() { LogicalResult UnidirectionalSequenceLSTMOp::inferReturnTypes( MLIRContext*, std::optional, ValueRange operands, - DictionaryAttr attr, RegionRange, + DictionaryAttr attr, OpaqueProperties, RegionRange, SmallVectorImpl& inferredReturnTypes) { Value input = operands[0]; auto input_type = input.getType().dyn_cast_or_null(); @@ -2922,7 +2928,7 @@ OpFoldResult RankOp::fold(FoldAdaptor adaptor) { assert(operands.size() == 1); auto result_type = getType().cast(); if (auto elements_attr = operands[0].dyn_cast_or_null()) { - auto rank = static_cast(elements_attr.getType().getRank()); + auto rank = static_cast(elements_attr.getShapedType().getRank()); return DenseElementsAttr::get(result_type, {rank}); } @@ -3145,9 +3151,9 @@ OpFoldResult RangeOp::fold(FoldAdaptor adaptor) { auto delta_tensor = operands[2].dyn_cast_or_null(); if (start_tensor && limit_tensor && delta_tensor) { // Operands should all be scalars - assert(start_tensor.getType().getRank() == 0 && - limit_tensor.getType().getRank() == 0 && - delta_tensor.getType().getRank() == 0); + assert(start_tensor.getShapedType().getRank() == 0 && + limit_tensor.getShapedType().getRank() == 0 && + delta_tensor.getShapedType().getRank() == 0); Type elem_type = getType().cast().getElementType(); if (elem_type.isSignlessInteger()) { auto start_attr = start_tensor.getValues()[0]; @@ -3328,9 +3334,12 @@ namespace { // The function recursively traverses the dimensions of the output tensor in // a row-major order and writes the value in the output tensor into // `new_values`. -void ComputePermutation(ElementsAttr input_tensor, ArrayRef perm, - ArrayRef output_shape, int num_dimensions, - int output_axis, std::vector* input_indices, +void ComputePermutation(mlir::detail::ElementsAttrRange< + mlir::detail::ElementsAttrIterator> + input_tensor_values, + ArrayRef perm, ArrayRef output_shape, + const int num_dimensions, const int output_axis, + std::vector* input_indices, std::vector* new_values) { // Refer to the implementation of `Transpose` function in // tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -3343,11 +3352,11 @@ void ComputePermutation(ElementsAttr input_tensor, ArrayRef perm, // recurse into the next axis. 
const bool is_last_axis = output_axis == num_dimensions - 1; if (is_last_axis) { - new_values->push_back( - input_tensor.getValues()[*input_indices]); + new_values->push_back(input_tensor_values[*input_indices]); } else { - ComputePermutation(input_tensor, perm, output_shape, num_dimensions, - output_axis + 1, input_indices, new_values); + ComputePermutation(input_tensor_values, perm, output_shape, + num_dimensions, output_axis + 1, input_indices, + new_values); } } } @@ -3366,11 +3375,11 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) { if (!getType().cast().getElementType().isSignlessIntOrFloat()) return nullptr; - assert(perm_tensor.getType().getRank() == 1); - const int num_dimensions = input_tensor.getType().getRank(); - assert(perm_tensor.getType().getNumElements() == num_dimensions); + assert(perm_tensor.getShapedType().getRank() == 1); + const int num_dimensions = input_tensor.getShapedType().getRank(); + assert(perm_tensor.getShapedType().getNumElements() == num_dimensions); - ArrayRef input_shape = input_tensor.getType().getShape(); + ArrayRef input_shape = input_tensor.getShapedType().getShape(); auto output_type = getType().cast(); SmallVector perm; @@ -3385,9 +3394,10 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) { } std::vector new_values; - new_values.reserve(input_tensor.getType().getNumElements()); + new_values.reserve(input_tensor.getShapedType().getNumElements()); std::vector input_indices(num_dimensions); - ComputePermutation(input_tensor, perm, output_shape, num_dimensions, + auto input_tensor_values = input_tensor.getValues(); + ComputePermutation(input_tensor_values, perm, output_shape, num_dimensions, /*output_axis=*/0, &input_indices, &new_values); auto result_type = tensorflow::GetTypeFromTFTensorShape( output_shape, output_type.getElementType()); @@ -3542,7 +3552,7 @@ void IfOp::getSuccessorRegions(std::optional index, // Otherwise, the successor is dependent on the condition. bool condition; if (auto cond_attr = operands.front().dyn_cast_or_null()) { - condition = cond_attr.getValue().isOneValue(); + condition = cond_attr.getValue().isOne(); } else { // If the condition isn't constant, both regions may be executed. regions.push_back(RegionSuccessor(&getThenRegion())); @@ -3703,9 +3713,9 @@ struct WhileResultOperandsMatchAndImplicitCapture // Replace with new While with matching operands and results. Operation* op = while_op.getOperation(); - Operation* new_op = rewriter.insert( - Operation::create(op->getLoc(), op->getName(), types, new_operands, - op->getAttrs(), {}, /*numRegions=*/2)); + Operation* new_op = rewriter.insert(Operation::create( + op->getLoc(), op->getName(), types, new_operands, op->getAttrs(), + op->getPropertiesStorage(), {}, /*numRegions=*/2)); for (int i = 0; i < 2; ++i) new_op->getRegion(i).takeBody(op->getRegion(i)); int new_index = 0; @@ -4057,7 +4067,7 @@ OpFoldResult EmbeddingLookupOp::fold(FoldAdaptor adaptor) { std::vector new_shape = value_attr.getType().getShape().vec(); new_shape[0] = lookup_attr.getType().getShape()[0]; - Type new_type = value_attr.getType().clone(new_shape); + auto new_type = value_attr.getType().clone(new_shape); return DenseElementsAttr::get(new_type, new_values); } @@ -4086,7 +4096,96 @@ Attribute ConstBytesAttr::parse(AsmParser& parser, Type type) { void ConstBytesAttr::print(mlir::AsmPrinter& printer) const { StringRef bytes_str = getValue(); - printer << " : \"0x" << llvm::toHex(bytes_str) << "\""; + // Elide the attribute if flag is set. 
+ std::optional limit = OpPrintingFlags().getLargeElementsAttrLimit(); + printer << " : \""; + if (limit && limit.value() < bytes_str.size()) { + printer << "__elided__"; + } else { + printer << "0x" << llvm::toHex(bytes_str); + } + printer << "\""; +} + +//===----------------------------------------------------------------------===// +// BitcastOp +//===----------------------------------------------------------------------===// + +int64_t GetTypeBitWidth(mlir::Type type) { + if (auto quant_type = type.dyn_cast()) { + return quant_type.getStorageTypeIntegralWidth(); + } + if (type.isIntOrFloat()) { + return std::max(type.getIntOrFloatBitWidth(), + static_cast(CHAR_BIT)); + } + return -1; +} + +LogicalResult BitcastOp::verify() { + BitcastOp op = *this; + auto input_type = op.getInput().getType().cast(); + auto output_type = op.getOutput().getType().cast(); + + auto input_element_type = input_type.getElementType(); + auto output_element_type = output_type.getElementType(); + + if (input_type.hasStaticShape()) { + const int input_element_type_bitwidth = GetTypeBitWidth(input_element_type); + const int output_element_type_bitwidth = + GetTypeBitWidth(output_element_type); + + if (input_element_type_bitwidth < 0 || output_element_type_bitwidth < 0) { + // Only supports quantized type, int and float types. + return op.emitOpError("Unsupported element type."); + } + + if (input_element_type_bitwidth < output_element_type_bitwidth) { + if (output_element_type_bitwidth % input_element_type_bitwidth != 0) { + return op.emitOpError( + "output element bitwidth is not multiple of input element " + "bitwidth"); + } + if (input_type.getShape().empty() || + input_type.getShape().back() % (output_element_type_bitwidth / + input_element_type_bitwidth) != + 0) { + return op.emitOpError( + "input rightmost dimension size is not multiple of the divisor"); + } + } else if (input_element_type_bitwidth > output_element_type_bitwidth) { + if (input_element_type_bitwidth % output_element_type_bitwidth != 0) { + return op.emitOpError( + "input element bitwidth is not multiple of output element " + "bitwidth"); + } + } + } + return success(); +} + +OpFoldResult BitcastOp::fold(FoldAdaptor adaptor) { + if (getType() == getInput().getType()) return getInput(); + return {}; +} + +//===----------------------------------------------------------------------===// +// DynamicUpdateSliceOp +//===----------------------------------------------------------------------===// + +OpFoldResult DynamicUpdateSliceOp::fold(FoldAdaptor) { + // Check if update replaces the whole tensor, meaning operand and update has + // the same shape and all start indices are zero. 
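Before continuing with the `DynamicUpdateSliceOp` fold, the `BitcastOp` verifier's bitwidth rules can be restated as a standalone predicate. This is a sketch over static shapes only; the op carries no options and derives all types from the surrounding tensors.

```c++
#include <cstdint>
#include <vector>

// Mirrors the verifier: widening requires the output width to be a multiple
// of the input width and the innermost dimension to split evenly into groups
// of that ratio (e.g. i8 -> i32 needs the last dimension to be a multiple of
// 4); narrowing only requires the input width to be a multiple of the output
// width; equal widths always pass.
bool BitcastWidthsCompatible(const std::vector<int64_t>& input_shape,
                             int64_t input_bits, int64_t output_bits) {
  if (input_bits <= 0 || output_bits <= 0) return false;  // unsupported types
  if (input_bits < output_bits) {
    if (output_bits % input_bits != 0) return false;
    const int64_t ratio = output_bits / input_bits;
    return !input_shape.empty() && input_shape.back() % ratio == 0;
  }
  if (input_bits > output_bits) return input_bits % output_bits == 0;
  return true;
}
```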
+ DenseIntElementsAttr indices_attr; + if (matchPattern(getStartIndices(), m_Constant(&indices_attr)) && + indices_attr.isSplat() && indices_attr.getSplatValue() == 0 && + getOperand().getType().hasStaticShape() && + getUpdate().getType().hasStaticShape() && + getOperand().getType() == getUpdate().getType()) { + return getUpdate(); + } + + return {}; } //===----------------------------------------------------------------------===// @@ -4133,7 +4232,7 @@ Operation* TFLDialect::materializeConstant(OpBuilder& builder, Attribute value, value.cast().getType() != type)) return builder.create(loc, type, value.cast()); if (arith::ConstantOp::isBuildableWith(value, type)) - return builder.create(loc, type, value); + return builder.create(loc, type, cast(value)); if (NoValueOp::isBuildableWith(value, type)) return builder.create(loc, type, value.cast()); return nullptr; diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index e1cb8de2c57..73740be2310 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_enums.h.inc" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/lite/schema/schema_generated.h" #define GET_ATTRDEF_CLASSES diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index c7991a28ba7..8266fc605c0 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -2362,13 +2362,13 @@ equivalent to setting: }]; let arguments = (ins - TFL_TensorOf<[F32, I32, I64, I8, UI8, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$input, + TFL_TensorOf<[F32, I32, I64, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$input, TFL_I32OrI64Tensor:$begin, TFL_I32OrI64Tensor:$size ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I8, UI8, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$output + TFL_TensorOf<[F32, I32, I64, I8, UI8, UI32, I1, TFL_Str, QI8, QUI8, TFL_Quint8, QI16]>:$output ); let hasVerifier = 1; @@ -2528,11 +2528,11 @@ def TFL_MulOp : TFL_Op<"mul", [ }]; let arguments = ( - ins TFL_TensorOf<[F32, I32, I64, QI8, QUI8, QI16, Complex>]>:$lhs, - TFL_TensorOf<[F32, I32, I64, QI8, QUI8, QI16, Complex>]>:$rhs, + ins TFL_TensorOf<[F32, I32, UI32, I64, QI8, QUI8, QI16, I16, Complex>]>:$lhs, + TFL_TensorOf<[F32, I32, UI32, I64, QI8, QUI8, QI16, I16, Complex>]>:$rhs, TFL_AFAttr:$fused_activation_function); - let results = (outs TFL_TensorOf<[F32, I32, I64, QI8, QUI8, QI16, Complex>]>:$output); + let results = (outs TFL_TensorOf<[F32, I32, UI32, I64, QI8, QUI8, QI16, I16, Complex>]>:$output); let hasFolder = 1; @@ -2612,14 +2612,14 @@ def TFL_PackOp : TFL_Op<"pack", [ }]; let arguments = (ins - TFL_VariadicTensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8, QI16, TFL_Quint8]>:$values, + TFL_VariadicTensorOf<[F32, I8, I16, I32, I64, UI8, UI32, QI8, QUI8, QI16, TFL_Quint8]>:$values, ConfinedAttr:$values_count, I32Attr:$axis ); let results = (outs - TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8, QI16, TFL_Quint8]>:$output + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, UI32, QI8, QUI8, QI16, TFL_Quint8]>:$output ); let hasVerifier = 1; @@ -3128,11 +3128,11 @@ def TFL_SelectOp : TFL_Op<"select", [ let arguments = (ins TFL_BoolTensor:$condition, - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$x, - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$y); + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, UI32, QI8, QUI8, QI16, TFL_Quint8]>:$x, + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, UI32, QI8, QUI8, QI16, TFL_Quint8]>:$y); let results = (outs - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$output); + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, UI32, QI8, QUI8, QI16, TFL_Quint8]>:$output); // TODO(jpienaar): autogenerate this. let builders = [ @@ -3167,11 +3167,11 @@ def TFL_SelectV2Op : TFL_Op<"select_v2", [ let arguments = (ins TFL_BoolTensor:$condition, - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$x, - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$y); + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, UI32, QI8, QUI8, QI16, TFL_Quint8]>:$x, + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, UI32, QI8, QUI8, QI16, TFL_Quint8]>:$y); let results = (outs - TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$output); + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, UI32, QI8, QUI8, QI16, TFL_Quint8]>:$output); let builders = [ OpBuilder<(ins "Value":$cond, "Value":$x, "Value":$y), @@ -3235,12 +3235,13 @@ def TFL_SoftmaxOp : TFL_Op<"softmax", [ // FixedOutputRangeInterface: quant::UniformQuantizedType GetFixedOutputRange( bool is_signed, int bit_width) { + if (bit_width != 8 && bit_width != 16) { return nullptr; } auto result_type = getOutput().getType(); // zero_point = 0 // scale = 1. 
/ (max_value + 1) return quant::GetFixedOutputRange(is_signed, bit_width, result_type, - /*scale=*/1.0 / (1<<(bit_width)), - /*zero_point=*/-(1<<(bit_width-1))); + /*scale=*/1.0 / (bit_width == 8 ? (1<<(bit_width)) : (1<<(bit_width-1))), + /*zero_point=*/bit_width == 8 ? -(1<<(bit_width-1)): 0); } }]; } @@ -3457,12 +3458,12 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8]>:$input, - TFL_I32Tensor:$k); + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8]>:$input, + TFL_TensorOf<[I16, I32]>:$k); let results = (outs - TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8]>:$values, - TFL_I32Tensor:$indices); + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8]>:$values, + TFL_TensorOf<[I16, I32]>:$indices); let builders = [ OpBuilder<(ins "Value":$input, "Value":$k), @@ -3587,13 +3588,13 @@ def TFL_BatchToSpaceNdOp: TFL_Op<"batch_to_space_nd", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8, QI16]>:$input, TFL_TensorOf<[I32]>:$block_shape, TFL_TensorOf<[I32]>:$indices ); let results = (outs - TFL_TensorOf<[F32, I16, I32, I64, UI8, QI8, QUI8]>:$output + TFL_TensorOf<[F32, I16, I32, I64, UI8, QI8, QUI8, QI16]>:$output ); } @@ -3612,13 +3613,13 @@ def TFL_SpaceToBatchNdOp: TFL_Op<"space_to_batch_nd", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8, QI16]>:$input, TFL_I32Tensor:$block_shape, TFL_I32Tensor:$paddings ); let results = (outs - TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8, QI16]>:$output ); } @@ -3863,7 +3864,7 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I32, I64, I8, UI8, QI8, QUI8, I1, I16, QI16, TFL_Quint8, TFL_Str]>:$input, + TFL_TensorOf<[F32, I32, I64, I8, UI8, UI32, QI8, QUI8, I1, I16, QI16, TFL_Quint8, TFL_Str]>:$input, TFL_I32Tensor:$begin, TFL_I32Tensor:$end, TFL_I32Tensor:$strides, @@ -3876,7 +3877,7 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", [ ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I8, UI8, QI8, QUI8, I1, I16, QI16, TFL_Quint8, TFL_Str]>:$output + TFL_TensorOf<[F32, I32, I64, I8, UI8, UI32, QI8, QUI8, I1, I16, QI16, TFL_Quint8, TFL_Str]>:$output ); // TFLite kernel only supports up to 5D input including added axis. @@ -4028,6 +4029,67 @@ def TFL_DynamicUpdateSliceOp: TFL_Op<"dynamic_update_slice", [ let results = ( outs TFL_TensorOf<[I1, I8, I32, I64, F32]>:$output); + + let hasFolder = 1; +} + +def TFL_BitcastOp : TFL_Op<"bitcast", [Pure]> { + let summary = "Bitcast operator"; + + let description = [{ + Bitcasts a tensor from one type to another. + }]; + + let arguments = (ins AnyTensor:$input); + + let results = (outs AnyTensor:$output); + + // TFLite's bitcast bitop does not utilize options, instead derives types + // from the TfLiteTensors. + let hasOptions = 0; + + let hasFolder = 1; + + let hasVerifier = 1; +} + +def TFL_BitwiseXorOp : TFL_Op<"bitwise_xor", [ + Commutative, + SameOperandsAndResultElementType, + Pure]> { + let summary = "Bitwise Xor operator"; + + let description = [{ + Elementwise computes the bitwise XOR of `lhs` and `rhs`. 
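For the `SoftmaxOp::GetFixedOutputRange` change above, the fixed quantization parameters now depend on bit width, and the representable output ranges follow from `real = scale * (q - zero_point)`. A quick standalone check of those ranges (a throwaway sketch, not library code):

```c++
#include <cstdint>
#include <cstdio>

// Dequantized value for a given quantized value q.
double Dequantize(int32_t q, double scale, int32_t zero_point) {
  return scale * (q - zero_point);
}

int main() {
  // 8-bit: scale = 1/256, zero_point = -128  -> covers [0, 255/256].
  std::printf("int8  range: [%f, %f]\n", Dequantize(-128, 1.0 / 256, -128),
              Dequantize(127, 1.0 / 256, -128));
  // 16-bit: scale = 1/32768, zero_point = 0  -> covers [-1, 32767/32768].
  std::printf("int16 range: [%f, %f]\n", Dequantize(-32768, 1.0 / 32768, 0),
              Dequantize(32767, 1.0 / 32768, 0));
  return 0;
}
```

Both ranges comfortably contain the softmax output interval [0, 1) for 8-bit and the signed [-1, 1) convention used for 16-bit.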
+ }]; + + let arguments = (ins + TFL_TensorOf<[I8, UI8, I16, UI16, I32, UI32]>:$lhs, + TFL_TensorOf<[I8, UI8, I16, UI16, I32, UI32]>:$rhs + ); + + let results = (outs + TFL_TensorOf<[I8, UI8, I16, UI16, I32, UI32]>:$output + ); +} + +def TFL_RightShiftOp : TFL_Op<"right_shift", [ + SameOperandsAndResultElementType, + Pure]> { + let summary = "Right Shift operator"; + + let description = [{ + Elementwise computes the bitwise right-shift of `lhs` by `rhs`. + }]; + + let arguments = (ins + TFL_TensorOf<[I8, UI8, I16, UI16, I32, UI32]>:$lhs, + TFL_TensorOf<[I8, UI8, I16, UI16, I32, UI32]>:$rhs + ); + + let results = (outs + TFL_TensorOf<[I8, UI8, I16, UI16, I32, UI32]>:$output + ); } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index 2249909aa0e..85c87fd66ad 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -35,10 +35,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/toco/types.pb.h" diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h index 6d90c2d08f4..e69d3c718d9 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h @@ -16,8 +16,8 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_GRAPHDEF_TO_TFL_FLATBUFFER_H_ #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" diff --git a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc index 998734c8d2a..1b0f22c7cd1 100644 --- a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc @@ -44,11 +44,11 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/translate/hlo_to_mhlo/hlo_to_mlir_hlo.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/toco/types.pb.h" diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index 9c3ab396b5d..74c09b3e9e6 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -39,10 +39,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/toco/types.pb.h" diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h index ed339ca64b9..362e9e39ae5 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h @@ -16,8 +16,8 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_SAVED_MODEL_TO_TFL_FLATBUFFER_H_ #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index bb36ebe81f4..5cfbc0c937a 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -35,10 +35,10 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/toco/types.pb.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc index 89eec9c7349..ae9b67e9e60 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc +++ b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc @@ -16,6 +16,7 @@ limitations under the License. #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project @@ -96,7 +97,7 @@ LogicalResult QuantizedConstRewrite::matchAndRewrite( auto fusedLoc = rewriter.getFusedLoc( {qbarrier.getArg().getDefiningOp()->getLoc(), qbarrier.getLoc()}); auto newConstOp = rewriter.create( - fusedLoc, newConstValueType, newConstValue); + fusedLoc, newConstValueType, cast(newConstValue)); rewriter.replaceOpWithNewOp(qbarrier, qbarrier.getType(), newConstOp); return success(); diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.cc b/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.cc index 3bd80ad4a7b..d111141958c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.cc +++ b/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.cc @@ -104,7 +104,7 @@ LogicalResult StatisticsOp::verify() { // Verify layerStats attribute. { - auto layerStatsType = getLayerStats().getType(); + auto layerStatsType = getLayerStats().getShapedType(); if (!layerStatsType.getElementType().isa()) { return emitOpError("layerStats must have a floating point element type"); } @@ -121,7 +121,7 @@ LogicalResult StatisticsOp::verify() { std::accumulate(std::next(shape.begin(), *getAxis()), shape.end(), 1, std::multiplies()); - auto axisStatsType = getAxisStats()->getType(); + auto axisStatsType = getAxisStats()->getShapedType(); if (!axisStatsType.getElementType().isa()) { return emitOpError("axisStats must have a floating point element type"); } diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index e21105fc5c4..29216f3be16 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -124,7 +124,7 @@ TfLiteStatus QuantizeModel( // If the first or final ops are not quantized, remove QDQ. 
pm.addPass(TFL::CreatePostQuantizeRemoveQDQPass()); if (failed(pm.run(module.get()))) { - const std::string& err = statusHandler.ConsumeStatus().error_message(); + const std::string err(statusHandler.ConsumeStatus().message()); error_reporter->Report("Failed to quantize: %s", err.c_str()); return kTfLiteError; } diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc index ce87e5d8f92..e784cf7a2eb 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc @@ -147,7 +147,7 @@ TfLiteStatus QuantizeWeights( tensorflow::AddDynamicRangeQuantizationPasses(quant_specs, pm); if (failed(pm.run(module.get()))) { - absl::string_view err = statusHandler.ConsumeStatus().error_message(); + absl::string_view err = statusHandler.ConsumeStatus().message(); error_reporter->Report("Failed to quantize: %s", err); return kTfLiteError; } diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 91bde7ede70..57a5c93556c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -335,7 +335,7 @@ class QuantizationDriver { fn_.walk([&](Operation *op) { std::unique_ptr scale_spec = GetQuantScaleSpec(op); if (op->hasTrait() || - (IsOpNotQuantizable(op) && !scale_spec->has_same_scale_requirement) || + (!IsOpQuantizable(op) && !scale_spec->has_same_scale_requirement) || llvm::isa(op)) { return; @@ -841,7 +841,7 @@ void QuantizationDriver::SetupAllStates() { fn_.walk([&](Operation *op) { std::unique_ptr scale_spec = GetQuantScaleSpec(op); - if (IsOpNotQuantizable(op) && !scale_spec->has_same_scale_requirement) { + if (!IsOpQuantizable(op) && !scale_spec->has_same_scale_requirement) { return; } work_list_.push_back(op); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index c559dd2403f..9a151a80e8f 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -16,12 +16,15 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include +#include #include +#include #include #include #include #include #include +#include #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -187,27 +190,26 @@ quant::UniformQuantizedPerAxisType ResetAxisAndBroadcast( } // namespace -bool IsOpNotQuantizable(Operation* op) { - // If it is terminator or not quantizable or any ops form the mlir quant - // ops dialect, we shouldn't rewrite. - bool attr_enforced_quantizable = +bool IsOpQuantizable(Operation* op) { + if (isa(op)) { + // Constant ops do not have QuantizableResult attribute but they can deal + // with quantized tensors. + return true; + } else if (op->hasTrait() || + isa(op)) { + // Terminators, qcast and decast are not quantizable. + return false; + } + + const bool attr_enforced_quantizable = op->hasAttrOfType(kQuantTraitAttrName) && op->getAttrOfType(kQuantTraitAttrName).getValue().str() == QuantTraitValues[QuantizationTrait::FullyQuantizable]; - // Constant ops do not have QuantizableResult attribute but they can deal with - // quantized tensors. 
- if (llvm::isa( - op)) - return false; - - bool prop_enforced_quantizable = + const bool trait_enforced_quantizable = op->hasTrait(); - return op->hasTrait() || - llvm::isa( - op) || - (!attr_enforced_quantizable && !prop_enforced_quantizable); + return attr_enforced_quantizable || trait_enforced_quantizable; } // Returns the quantized type for the diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index b91db7965b0..1113bb868fa 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -20,31 +20,38 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_UTILS_H_ #include +#include +#include #include #include #include #include #include +#include +#include #include "absl/container/flat_hash_set.h" #include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Twine.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h" @@ -183,7 +190,7 @@ quant::QuantizedType DownCastScale(quant::QuantizedType type, quant::QuantizedType DownCastScale(quant::QuantizedType type, double min, double max, Location loc); -bool IsOpNotQuantizable(Operation* op); +bool IsOpQuantizable(Operation* op); // Specialized version of location to string for flatbuffer exported locations. inline std::string GetTensorNameFromLoc(Location loc) { @@ -439,7 +446,7 @@ class QuantizationPattern : public RewritePattern { return failure(); } - if (IsOpNotQuantizable(quantizing_op) && + if (!IsOpQuantizable(quantizing_op) && !static_cast(this)->IsQuantizableCustomOp( quantizing_op, custom_map)) { if (!(enable_verify && enable_whole_model_verify)) { @@ -646,7 +653,7 @@ class QuantizationPattern : public RewritePattern { // compared against in parallel. // N.B. the return op will use this floating-point result. Value result; - if (IsOpNotQuantizable(float_op)) { + if (!IsOpQuantizable(float_op)) { // For not quantizable ops, search for dequantize attached to the // quantized op of the output. 
if (Operation* quantize_op = dyn_cast_or_null( diff --git a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc index a9614c0e62c..8c9035f2184 100644 --- a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc +++ b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc @@ -65,7 +65,7 @@ TfLiteStatus SparsifyModel(const tflite::ModelT& input_model, pm.addPass(TFL::CreateDenseToSparsePass()); if (failed(pm.run(module.get()))) { - const std::string& err = statusHandler.ConsumeStatus().error_message(); + const std::string err(statusHandler.ConsumeStatus().message()); error_reporter->Report("Failed to sparsify: %s", err.c_str()); return kTfLiteError; } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD index 27ca62b52cb..258da6bc55c 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD @@ -132,7 +132,7 @@ cc_library( ":stablehlo_util", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/xla/mlir_hlo", "//tensorflow/compiler/xla/mlir_hlo:hlo_dialect_registration", "//tensorflow/compiler/xla/mlir_hlo:mhlo_passes", @@ -199,7 +199,7 @@ cc_library( "//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", - "//tensorflow/compiler/mlir/tf2xla:tf_xla_passes", + "//tensorflow/compiler/mlir/tf2xla/transforms:tf_xla_passes", "//tensorflow/compiler/xla/mlir_hlo:mhlo_passes", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Transforms", @@ -317,6 +317,33 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "legalize_tf_xla_call_module_to_stablehlo_pass", + srcs = [ + "transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc", + ], + hdrs = [ + "transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h", + ], + copts = [ + "-Ithird_party", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Transforms", + "@stablehlo//:stablehlo_ops", + "@stablehlo//:stablehlo_serialization", + "@stablehlo//:vhlo_ops", + ], + alwayslink = 1, +) + cc_library( name = "optimize", srcs = [ @@ -362,7 +389,7 @@ tf_cc_binary( "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", - "//tensorflow/compiler/mlir/tf2xla:legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:legalize_tf", "//tensorflow/compiler/xla/mlir/framework/transforms:passes", "//tensorflow/compiler/xla/mlir_hlo:all_passes", "//tensorflow/compiler/xla/mlir_hlo:hlo_dialect_registration", @@ -387,6 +414,7 @@ tf_cc_binary( deps = [ ":fold_broadcast_pass", ":fuse_convolution_pass", + ":legalize_tf_xla_call_module_to_stablehlo_pass", ":optimize", ":stablehlo_tfl", ":tf_stablehlo", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc index 88330c7356b..525d73c1b79 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc 
+++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc @@ -115,6 +115,11 @@ opt elide_large_elements_attrs( "e", llvm::cl::desc("Elide large elements attrs."), llvm::cl::Optional, llvm::cl::init(false)); +// NOLINTNEXTLINE +opt debug_info( + "debug-info", llvm::cl::desc("Include MLIR debug location info in output."), + llvm::cl::Optional, llvm::cl::init(false)); + // NOLINTNEXTLINE opt allow_tf("allow-tf", llvm::cl::desc("Allow TF dialect."), llvm::cl::Optional, llvm::cl::init(false)); @@ -143,6 +148,11 @@ opt freeze_tf_graph( llvm::cl::desc("Freeze TF graph to remove tf.ResourceVariable, etc."), llvm::cl::Optional, llvm::cl::init(false)); +// NOLINTNEXTLINE +opt exported_model_signatures( + "exported_model_signatures", llvm::cl::desc("Comma-separated list of exported model signatures."), + llvm::cl::Optional, llvm::cl::init("serving_default")); + namespace mlir { namespace odml { @@ -165,7 +175,8 @@ tensorflow::StatusOr> ImportSavedModelOrMLIR( // TODO(pulkitb): Remove hard-coded tag. std::unordered_set tags({"serve"}); - auto exported_names_in_vector = std::vector({}); + std::vector exported_names_in_vector = + absl::StrSplit(exported_model_signatures, ','); absl::Span exported_names(exported_names_in_vector); std::vector custom_opdefs; @@ -217,6 +228,9 @@ tensorflow::Status ExportModule(mlir::ModuleOp module, std::string result; llvm::raw_string_ostream os(result); OpPrintingFlags printing_flags; + if (debug_info) { + printing_flags.enableDebugInfo(); + } if (elide_large_elements_attrs) { printing_flags.elideLargeElementsAttrs(); } @@ -232,7 +246,10 @@ tensorflow::Status ExportModule(mlir::ModuleOp module, tensorflow::Status ConvertTFToStableHLO( ModuleOp tf_module, const PassPipelineCLParser& pass_pipeline) { PassManager pm(tf_module.getContext()); - applyPassManagerCLOptions(pm); + if (failed(applyPassManagerCLOptions(pm))) { + return tensorflow::errors::Aborted( + "Failed to apply MLIR pass manager CL options."); + } auto error_handler = [&](const Twine& msg) { emitError(UnknownLoc::get(pm.getContext())) << msg; @@ -330,14 +347,14 @@ tensorflow::Status RunConverter(const PassPipelineCLParser& pass_pipeline) { ExportModule(*module, output_path, elide_large_elements_attrs); if (!conversion_status.ok()) { LOG(ERROR) << "TF to StableHLO conversion failed: " - << conversion_status.error_message(); + << conversion_status.message(); auto debug_export_status = ExportModule( *module, absl::StrCat(verbose_dir, "/debug_stablehlo.mlir"), elide_large_elements_attrs); if (!debug_export_status.ok()) { LOG(ERROR) << "Failed to export debug_stablehlo.mlir: " - << debug_export_status.error_message(); + << debug_export_status.message(); } return conversion_status; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.cc b/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.cc index 74708abebe8..29f4977b1bf 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.cc @@ -759,7 +759,7 @@ Translator::BuildSubGraph(const std::string& name, Region* region, int index) { } bool failed_once = false; - for (auto& item : llvm::enumerate(bb)) { + for (const auto& item : llvm::enumerate(bb)) { Operation& inst = item.value(); const int operation_index = item.index(); if (inst.hasTrait()) break; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/fuse_convolution_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/fuse_convolution_pass.cc index
45c8edc1ec5..9918d044c5b 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/fuse_convolution_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/fuse_convolution_pass.cc @@ -84,7 +84,7 @@ class FuseMhloMulAndConvolutionPattern : public OpRewritePattern { // Only fuses multiplier if all dimensions other than the out channel // dimension are equal to 1. if (!TFL::IsDimensionsDegenerateExceptLastOne( - mul_value.getType().getShape())) { + mul_value.getShapedType().getShape())) { return rewriter.notifyMatchFailure(mul_op, [&](::mlir::Diagnostic &diag) { diag << "entities 'mul_value' failed to satisfy constraint: " "unsupported dimensions"; @@ -97,9 +97,10 @@ class FuseMhloMulAndConvolutionPattern : public OpRewritePattern { } // Rewrite - broadcast_dims = broadcast_op.getBroadcastDimensions(); + broadcast_dims = + broadcast_op ? broadcast_op.getBroadcastDimensions() : nullptr; if (broadcast_dims == nullptr) { - const auto filter_rank = filter_value.getType().getRank(); + const auto filter_rank = filter_value.getShapedType().getRank(); auto dimsType = RankedTensorType::get({1}, rewriter.getIntegerType(64)); broadcast_dims = DenseIntElementsAttr::get(dimsType, {filter_rank - 1}); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc new file mode 100644 index 00000000000..b7277ae0415 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc @@ -0,0 +1,176 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" + +#include +#include +#include +#include + +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/Serialization.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace odml { + +static constexpr std::string_view kStablehloModuleDefaultEntryFuncName = "main"; +static constexpr std::string_view kStablehloFuncNamePrefix = "XlaCallModule"; + +class ConvertTFXlaCallModuleOp + : public mlir::OpRewritePattern<mlir::TF::XlaCallModuleOp> { + public: + explicit ConvertTFXlaCallModuleOp(MLIRContext *context, ModuleOp module_op) + : OpRewritePattern(context), + module_op_(module_op) {} + using OpRewritePattern::OpRewritePattern; + + private: + ModuleOp module_op_; + mlir::LogicalResult matchAndRewrite( + mlir::TF::XlaCallModuleOp op, PatternRewriter &rewriter) const override { + mlir::OwningOpRef<ModuleOp> stablehlo_module_op = + mlir::stablehlo::deserializePortableArtifact(op.getModuleAttr(), + getContext()); + if (stablehlo_module_op.get() == nullptr) { + return mlir::failure(); + } + SymbolTable parent_module_symbol_table(module_op_); + SymbolTable stablehlo_module_symbol_table(stablehlo_module_op.get()); + if (stablehlo_module_symbol_table.lookup( + kStablehloModuleDefaultEntryFuncName) == nullptr) { + return rewriter.notifyMatchFailure( + op, "could not find main function in XlaCallModuleOp"); + } + mlir::Builder stablehlo_builder(stablehlo_module_op.get().getContext()); + // Rename XlaCallModuleOp's functions to avoid naming conflicts. + for (auto func_op : + stablehlo_module_op.get().getOps<func::FuncOp>()) { + const std::string new_func_name = + CreateNewFuncName(func_op.getSymName(), parent_module_symbol_table); + if (failed(stablehlo_module_symbol_table.replaceAllSymbolUses( + func_op, stablehlo_builder.getStringAttr(new_func_name), + stablehlo_module_op.get()))) { + return mlir::failure(); + } + mlir::SymbolTable::setSymbolName(func_op, new_func_name); + } + // Move all functions from the XlaCallModuleOp's stablehlo module to the + // parent module, and mark the stablehlo module's entry function as private.
+ mlir::func::FuncOp main_fn; + for (auto func_op : + stablehlo_module_op.get().getOps<func::FuncOp>()) { + mlir::func::FuncOp cloned_func_op = func_op.clone(); + if (cloned_func_op.getSymName().contains( + kStablehloModuleDefaultEntryFuncName)) { + main_fn = cloned_func_op; + main_fn.setSymVisibility(stablehlo_builder.getStringAttr("private")); + } + parent_module_symbol_table.insert(cloned_func_op); + } + + // The stablehlo module main function's input tensor types might be + // different from the XlaCallModuleOp's input tensor types. For example, + // the XlaCallModuleOp's input is tensor<*xf32> while the function's + // argument type is tensor<1x2xf32>. + llvm::SmallVector<Value> casted_operands; + casted_operands.reserve(main_fn.getNumArguments()); + for (const auto &operand_and_type : + zip(op.getOperands(), main_fn.getFunctionType().getInputs())) { + Value operand = std::get<0>(operand_and_type); + Type expected_type = std::get<1>(operand_and_type); + if (operand.getType() != expected_type) { + operand = rewriter.create<TF::CastOp>( + op.getLoc(), expected_type, operand, + /*Truncate=*/rewriter.getBoolAttr(false)); + } + casted_operands.push_back(operand); + } + + auto call = rewriter.create<func::CallOp>( + op->getLoc(), main_fn.getSymName(), main_fn.getResultTypes(), + casted_operands); + rewriter.replaceOp(op, call->getResults()); + + return mlir::success(); + } + + // Creates a new function name to avoid collision. The naming scheme is + // XlaCallModule_%s_%d where %s is the original function name and %d is the + // counter. + std::string CreateNewFuncName(const StringRef func_name, + SymbolTable &symbol_table) const { + int suffix_id = 0; + std::string new_func_name = absl::StrCat(kStablehloFuncNamePrefix, "_", + func_name.str(), "_", suffix_id); + while (symbol_table.lookup(new_func_name)) { + suffix_id++; + new_func_name = absl::StrCat(kStablehloFuncNamePrefix, "_", + func_name.str(), "_", suffix_id); + } + return new_func_name; + } +}; + +class TFXlaCallModuleOpToStablehloPass + : public PassWrapper<TFXlaCallModuleOpToStablehloPass, OperationPass<ModuleOp>> { + public: + StringRef getArgument() const final { + return "tf-xla-call-module-op-to-stablehlo-pass"; + } + StringRef getDescription() const final { + return "Legalize TF_XlaCallModule Op to stablehlo"; + } + void getDependentDialects(::mlir::DialectRegistry &registry) const override { + registry.insert(); + } + + void runOnOperation() override { + ModuleOp module_op = getOperation(); + RewritePatternSet patterns(&getContext()); + patterns.add<ConvertTFXlaCallModuleOp>(&getContext(), module_op); + if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { + return signalPassFailure(); + } + } +}; + +std::unique_ptr<OperationPass<ModuleOp>> +CreateLegalizeTFXlaCallModuleToStablehloPass() { + return std::make_unique<TFXlaCallModuleOpToStablehloPass>(); +} + +static PassRegistration<TFXlaCallModuleOpToStablehloPass> pass; + +} // namespace odml +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h new file mode 100644 index 00000000000..9bcee095f27 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h @@ -0,0 +1,35 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_XLA_CALL_MODULE_TO_STABLEHLO_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_XLA_CALL_MODULE_TO_STABLEHLO_PASS_H_ + +#include <memory> + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Creates a pass that transforms TF_XlaCallModule ops into StableHLO ops. +// Note that this pass only supports static shape tensors for now. +std::unique_ptr<OperationPass<ModuleOp>> +CreateLegalizeTFXlaCallModuleToStablehloPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_XLA_CALL_MODULE_TO_STABLEHLO_PASS_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc index 14cbe5963e3..476b02e0bd8 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc @@ -65,7 +65,6 @@ void AddTFToStablehloPasses(OpPassManager& pm, bool skip_resize, pm.addNestedPass(CreateSmuggleDisallowedOpsPass()); pm.addPass(mlir::createCanonicalizerPass()); } - pm.addPass(CreateDropSavedModelSemanticsPass()); } void AddStablehloOptimizationPasses(OpPassManager& pm) { diff --git a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir index 082c0627b09..17b724051cd 100644 --- a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir @@ -326,3 +326,21 @@ func.func @broadcast_to_to_reshape_i64_const(%arg0: tensor<4x4x4xf32>) -> tensor // CHECK-SAME: (tensor<4x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x4xf32> func.return %0 : tensor<1x4x4x4xf32> } + +// ----- + +func.func @trivial_dynamic_update_slice(%arg0: tensor<2x7x14xf32>, %arg1: tensor<2x7x14xf32>) -> tensor<2x7x14xf32> { + %0 = arith.constant dense<0> : tensor<3xi32> + %1 = "tfl.dynamic_update_slice"(%arg0, %arg1, %0) : (tensor<2x7x14xf32>, tensor<2x7x14xf32>, tensor<3xi32>) -> tensor<2x7x14xf32> + // CHECK: return %arg1 + func.return %1 : tensor<2x7x14xf32> +} + +// ----- + +func.func @trivial_dynamic_update_slice_wrong_update_shape(%arg0: tensor<2x7x14xf32>, %arg1: tensor<2x7x7xf32>) -> tensor<2x7x14xf32> { + %0 = arith.constant dense<0> : tensor<3xi32> + %1 = "tfl.dynamic_update_slice"(%arg0, %arg1, %0) : (tensor<2x7x14xf32>, tensor<2x7x7xf32>, tensor<3xi32>) -> tensor<2x7x14xf32> + // CHECK: "tfl.dynamic_update_slice" + func.return %1 : tensor<2x7x14xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/disallow_stateful_partitioned_call.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/disallow_stateful_partitioned_call.pbtxt new file mode 100644 index 00000000000..db6998bd7d9 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/end2end/disallow_stateful_partitioned_call.pbtxt @@ -0,0 +1,195 @@ +# RUN: not tf_tfl_translate -tf-input-arrays=input0
-tf-input-shapes=-1 -tf-input-data-types=DT_FLOAT -tf-output-arrays=add %s 2>&1 | FileCheck %s +# CHECK: error: The Graph contains unsupported `StatefulPartionedCallOp`(s) + +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } +} +node { + name: "args_0" + op: "_Arg" + attr { + key: "T" + value { + type: DT_RESOURCE + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "spc1" + op: "StatefulPartitionedCall" + input: "input0" + input: "args_0" + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + type: DT_RESOURCE + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + type: DT_RESOURCE + } + } + } + attr { + key: "config" + value { + s: "" + } + } + attr { + key: "config_proto" + value { + s: "" + } + } + attr { + key: "executor_type" + value { + s: "" + } + } + attr { + key: "f" + value { + func { + name: "function" + } + } + } +} +node { + name: "spc2" + op: "StatefulPartitionedCall" + input: "input0" + input: "args_0" + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + type: DT_RESOURCE + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + type: DT_RESOURCE + } + } + } + attr { + key: "config" + value { + s: "" + } + } + attr { + key: "config_proto" + value { + s: "" + } + } + attr { + key: "executor_type" + value { + s: "" + } + } + attr { + key: "f" + value { + func { + name: "function" + } + } + } +} +node { + name: "add" + op: "Add" + input: "spc1" + input: "spc2" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +library { + function { + signature { + name: "function" + input_arg { + name: "inputs" + type: DT_FLOAT + } + input_arg { + name: "statefulpartitionedcall_args_1" + type: DT_RESOURCE + } + output_arg { + name: "identity" + type: DT_FLOAT + } + is_stateful: true + } + node_def { + name: "Identity" + op: "Identity" + input: "inputs" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + } + ret { + key: "identity" + value: "Identity:output:0" + } + arg_attr { + key: 0 + value { + attr { + key: "_user_specified_name" + value { + s: "inputs" + } + } + } + } + arg_attr { + key: 1 + value { + } + } + } +} +versions { + producer: 121 +} diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 109ab804748..4f58b7af868 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1865,26 +1865,29 @@ func.func @maximum_with_6d_broadcasting(%arg0: tensor<1x1x1x1x8x16xf32>, %arg1: // ----- -func.func @add_with_int32_5d_inputs(%arg0: tensor<1x1x1x3x1xi32>, %arg1 : tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> { +func.func @test5DAddWithImplicitBroadcast(%arg0: tensor<1x1x1x3x1xi32>, %arg1 : tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> { %0 = "tf.Add"(%arg0, %arg1): (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> func.return %0 : tensor<1x1x1x3x4xi32> -// CHECK-LABEL: add_with_int32_5d_inputs -// CHECK: [[CST:%.*]] = arith.constant dense<[1, 1, 1, 3, 4]> : tensor<5xi64> -// CHECK: [[BCT:%.*]] = "tfl.broadcast_to"(%arg0, [[CST]]) -// CHECK: [[BCT_0:%.*]] = "tfl.broadcast_to"(%arg1, [[CST]]) -// CHECK: tfl.add [[BCT]], [[BCT_0]] +// CHECK-LABEL: test5DAddWithImplicitBroadcast +// CHECK: %0 = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> } -// CHECK-LABEL: testAddWithBroadcastToOps -func.func 
@testAddWithBroadcastToOps(%arg0: tensor<1x2x1x4x5x6xi32>, %arg1: tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> { - // CHECK: [[CST:%.*]] = arith.constant dense<[1, 2, 3, 4, 5, 6]> : tensor<6xi64> - // CHECK: [[BCAST:%.*]] = "tfl.broadcast_to"(%arg0, [[CST]]) - // CHECK: [[BCAST_1:%.*]] = "tfl.broadcast_to"(%arg1, [[CST]]) - // CHECK: tfl.add [[BCAST]], [[BCAST_1]] {fused_activation_function = "NONE"} : tensor<1x2x3x4x5x6xi32> +func.func @test6DAddWithImplicitBroadcast(%arg0: tensor<1x2x1x4x5x6xi32>, %arg1: tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> { +// CHECK-LABEL: test6DAddWithImplicitBroadcast +// CHECK: %0 = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> %0 = "tf.Add"(%arg0, %arg1) : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> func.return %0 : tensor<1x2x3x4x5x6xi32> } +func.func @add_with_int32_7d_inputs(%arg0: tensor<1x1x1x1x1x3x1xi32>, %arg1 : tensor<1x1x1x1x1x1x4xi32>) -> tensor<1x1x1x1x1x3x4xi32> { + %0 = "tf.Add"(%arg0, %arg1): (tensor<1x1x1x1x1x3x1xi32>, tensor<1x1x1x1x1x1x4xi32>) -> tensor<1x1x1x1x1x3x4xi32> + func.return %0 : tensor<1x1x1x1x1x3x4xi32> +// CHECK-LABEL: add_with_int32_7d_inputs +// CHECK: %0 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<1x1x1x1x1x3x1xi32>, tensor<7xi64>) -> tensor<1x1x1x1x1x3x4xi32> +// CHECK: %1 = "tfl.broadcast_to"(%arg1, %cst) : (tensor<1x1x1x1x1x1x4xi32>, tensor<7xi64>) -> tensor<1x1x1x1x1x3x4xi32> +// CHECK: %2 = tfl.add %0, %1 {fused_activation_function = "NONE"} : tensor<1x1x1x1x1x3x4xi32> +} + // CHECK-LABEL: testSubWithBroadcastToOps func.func @testSubWithBroadcastToOps(%arg0: tensor<1x2x1x4x5x6xi32>, %arg1: tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> { // CHECK: [[CST:%.*]] = arith.constant dense<[1, 2, 3, 4, 5, 6]> : tensor<6xi64> @@ -2338,6 +2341,24 @@ func.func @mul_i64(%arg0: tensor<14xi64>, %arg1: tensor<14xi64>) -> tensor<14xi6 // CHECK: return } +func.func @mul_i16(%arg0: tensor<14xi16>, %arg1: tensor<14xi16>) -> tensor<14xi16> { + %0 = "tf.Mul"(%arg0, %arg1) : (tensor<14xi16>, tensor<14xi16>) -> tensor<14xi16> + func.return %0: tensor<14xi16> + +// CHECK-LABEL: mul_i16 +// CHECK: tfl.mul %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<14xi16> +// CHECK: return +} + +func.func @mul_ui32(%arg0: tensor<14xui32>, %arg1: tensor<14xui32>) -> tensor<14xui32> { + %0 = "tf.Mul"(%arg0, %arg1) : (tensor<14xui32>, tensor<14xui32>) -> tensor<14xui32> + func.return %0: tensor<14xui32> + +// CHECK-LABEL: mul_ui32 +// CHECK: tfl.mul %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<14xui32> +// CHECK: return +} + func.func @mul_complex32(%arg0: tensor<14xcomplex>, %arg1: tensor<14xcomplex>) -> tensor<14xcomplex> { %0 = "tf.Mul"(%arg0, %arg1) : (tensor<14xcomplex>, tensor<14xcomplex>) -> tensor<14xcomplex> func.return %0: tensor<14xcomplex> @@ -2515,6 +2536,69 @@ func.func @sign(%arg0: tensor<8xf32>) -> tensor<8xf32> { // CHECK: return %[[RES0]] : tensor<8xf32> } +func.func @bitcast(%arg0: tensor<8xi32>) -> tensor<8xui32> { + %0 = "tf.Bitcast"(%arg0) : (tensor<8xi32>) -> tensor<8xui32> + func.return %0 : tensor<8xui32> + +// CHECK-LABEL: bitcast +// CHECK: %[[RES0:.*]] = "tfl.bitcast"(%arg0) : (tensor<8xi32>) -> tensor<8xui32> +// CHECK: return %[[RES0]] : tensor<8xui32> +} + +func.func @bitcastI32ToI16(%arg0: tensor<8xi32>) -> tensor<8x2xi16> { + %0 = "tf.Bitcast"(%arg0) : (tensor<8xi32>) -> tensor<8x2xi16> + func.return %0 : tensor<8x2xi16> + +// CHECK-LABEL: bitcastI32ToI16 
+// CHECK: %[[RES0:.*]] = "tfl.bitcast"(%arg0) : (tensor<8xi32>) -> tensor<8x2xi16> +// CHECK: return %[[RES0]] : tensor<8x2xi16> +} + +func.func @bitcastI16ToUI32(%arg0: tensor<8x2xi16>) -> tensor<8xui32> { + %0 = "tf.Bitcast"(%arg0) : (tensor<8x2xi16>) -> tensor<8xui32> + func.return %0 : tensor<8xui32> + +// CHECK-LABEL: bitcastI16ToUI32 +// CHECK: %[[RES0:.*]] = "tfl.bitcast"(%arg0) : (tensor<8x2xi16>) -> tensor<8xui32> +// CHECK: return %[[RES0]] : tensor<8xui32> +} + +func.func @bitcastFloatToI16(%arg0: tensor<8xf32>) -> tensor<8x2xi16> { + %0 = "tf.Bitcast"(%arg0) : (tensor<8xf32>) -> tensor<8x2xi16> + func.return %0 : tensor<8x2xi16> + +// CHECK-LABEL: bitcastFloatToI16 +// CHECK: %[[RES0:.*]] = "tfl.bitcast"(%arg0) : (tensor<8xf32>) -> tensor<8x2xi16> +// CHECK: return %[[RES0]] : tensor<8x2xi16> +} + +func.func @bitcastI16ToFloat(%arg0: tensor<8x2xi16>) -> tensor<8xf32> { + %0 = "tf.Bitcast"(%arg0) : (tensor<8x2xi16>) -> tensor<8xf32> + func.return %0 : tensor<8xf32> + +// CHECK-LABEL: bitcastI16ToFloat +// CHECK: %[[RES0:.*]] = "tfl.bitcast"(%arg0) : (tensor<8x2xi16>) -> tensor<8xf32> +// CHECK: return %[[RES0]] : tensor<8xf32> +} + +func.func @testBitwiseXor(%arg0: tensor<8xui32>, %arg1: tensor<8xui32>) -> tensor<8xui32> { + %0 = "tf.BitwiseXor"(%arg0, %arg1) : (tensor<8xui32>, tensor<8xui32>) -> tensor<8xui32> + func.return %0 : tensor<8xui32> + + // CHECK-LABEL: testBitwiseXor + // CHECK: %[[RES0:.*]] = "tfl.bitwise_xor"(%arg0, %arg1) : (tensor<8xui32>, tensor<8xui32>) -> tensor<8xui32> + // CHECK: return %[[RES0]] : tensor<8xui32> +} + +func.func @testRightShift(%arg0: tensor<8xui32>, %arg1: tensor<8xui32>) -> tensor<8xui32> { + %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<8xui32>, tensor<8xui32>) -> tensor<8xui32> + func.return %0 : tensor<8xui32> + + // CHECK-LABEL: testRightShift + // CHECK: %[[RES0:.*]] = "tfl.right_shift"(%arg0, %arg1) : (tensor<8xui32>, tensor<8xui32>) -> tensor<8xui32> + // CHECK: return %[[RES0]] : tensor<8xui32> +} + // ============================================================================= // Training OPs // ============================================================================= diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index b424a26964e..628e523c488 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -352,6 +352,22 @@ func.func @testMul(tensor, tensor) -> tensor { func.return %0#0 : tensor } +// CHECK-LABEL: testMul32BitUInt +func.func @testMul32BitUInt(tensor, tensor) -> tensor { +^bb0(%arg0: tensor, %arg1: tensor): + // CHECK: tfl.mul %arg0, %arg1 {fused_activation_function = "RELU6"} + %0 = tfl.mul %arg0, %arg1 {fused_activation_function = "RELU6"} : tensor + func.return %0#0 : tensor +} + +// CHECK-LABEL: testMul16BitInt +func.func @testMul16BitInt(tensor, tensor) -> tensor { +^bb0(%arg0: tensor, %arg1: tensor): + // CHECK: tfl.mul %arg0, %arg1 {fused_activation_function = "RELU6"} + %0 = tfl.mul %arg0, %arg1 {fused_activation_function = "RELU6"} : tensor + func.return %0#0 : tensor +} + // CHECK-LABEL: testMulComplex func.func @testMulComplex(tensor>, tensor>) -> tensor> { ^bb0(%arg0: tensor>, %arg1: tensor>): @@ -397,6 +413,14 @@ func.func @mul_with_i32_five_dim_broadcasting(tensor<1x1x1x1x1xi32>, tensor<1xi3 // ----- +func.func @add_with_i32_five_dim_broadcasting(tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32> { +^bb0(%arg0: tensor<1x1x1x1x1xi32>, %arg1: tensor<1xi32>): + %0 = "tfl.add"(%arg0, 
%arg1) {fused_activation_function = "RELU6"} : (tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32> + func.return %0#0 : tensor<1x1x1x1x1xi32> +} + +// ----- + func.func @mul_with_quantized_i16_five_dim_broadcasting(tensor<1x1x1x1x1x!quant.any>, tensor<1x!quant.any>) -> tensor<1x1x1x1x1x!quant.any> { ^bb0(%arg0: tensor<1x1x1x1x1x!quant.any>, %arg1: tensor<1x!quant.any>): // expected-error @+1 {{Operands do not have valid shapes}} @@ -1429,6 +1453,7 @@ func.func @unpackQuantized(%arg0: tensor<2x3x!quant.uniform>) -> t // ----- func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+2 {{failed to infer returned types}} // expected-error @+1 {{output count should match 'num' attribute}} %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 2 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) func.return %0#0 : tensor<2xi32> @@ -1437,6 +1462,7 @@ func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // ----- func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+2 {{failed to infer returned types}} // expected-error @+1 {{attribute 'axis' should be in range [-rank, rank), got axis = 2, and rank = 2}} %0:3 = "tfl.unpack"(%arg0) {axis = 2 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) func.return %0#0 : tensor<2xi32> @@ -1445,6 +1471,7 @@ func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // ----- func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+2 {{failed to infer returned types}} // expected-error @+1 {{attribute 'axis' should be in range [-rank, rank), got axis = -3, and rank = 2}} %0:3 = "tfl.unpack"(%arg0) {axis = -3 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) func.return %0#0 : tensor<2xi32> @@ -1453,6 +1480,7 @@ func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // ----- func.func @unpack(%arg0: tensor) -> tensor<2xi32> { + // expected-error @+2 {{failed to infer returned types}} // expected-error @+1 {{input should be of rank larger than 0}} %0:3 = "tfl.unpack"(%arg0) {axis = 0 : i32, num = 3 : i32} : (tensor) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) func.return %0#0 : tensor<2xi32> @@ -1461,6 +1489,7 @@ func.func @unpack(%arg0: tensor) -> tensor<2xi32> { // ----- func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+2 {{failed to infer returned types}} // expected-error @+1 {{op inferred type(s) 'tensor<2xi32>', 'tensor<2xi32>', 'tensor<2xi32>' are incompatible with return type(s) of operation 'tensor<2xi32>', 'tensor<2x1xi32>', 'tensor<2xi32>'}} %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2x1xi32>, tensor<2xi32>) func.return %0#0 : tensor<2xi32> @@ -3121,3 +3150,34 @@ func.func @testUnsortedSegmentMin(%arg0: tensor<8xf32>, %arg1: tensor<8xi32>, % func.return %0 : tensor<8xf32> // CHECK: return %0 : tensor<8xf32> } + + +// ----- + +// CHECK-LABEL: testBitcast +func.func @testBitcast(%arg0: tensor<8xui32>) -> tensor<8xi32> { + // CHECK: "tfl.bitcast"(%arg0) + %0 = "tfl.bitcast"(%arg0) : (tensor<8xui32>) -> tensor<8xi32> + func.return %0 : tensor<8xi32> + // CHECK: return %0 : tensor<8xi32> +} + +// ----- + +// CHECK-LABEL: testBitwiseXor +func.func @testBitwiseXor(%arg0: tensor<8xui32>, %arg1: tensor<8xui32>) -> tensor<8xui32> { + // CHECK: "tfl.bitwise_xor"(%arg0, %arg1) + %0 = "tfl.bitwise_xor"(%arg0, %arg1) : (tensor<8xui32>, tensor<8xui32>) -> 
tensor<8xui32> + func.return %0 : tensor<8xui32> + // CHECK: return %0 : tensor<8xui32> +} + +// ----- + +// CHECK-LABEL: testRightShift +func.func @testRightShift(%arg0: tensor<8xui32>, %arg1: tensor<8xui32>) -> tensor<8xui32> { + // CHECK: "tfl.right_shift"(%arg0, %arg1) + %0 = "tfl.right_shift"(%arg0, %arg1) : (tensor<8xui32>, tensor<8xui32>) -> tensor<8xui32> + func.return %0 : tensor<8xui32> + // CHECK: return %0 : tensor<8xui32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 05880e8ef43..8d57178a47f 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -13,7 +13,7 @@ func.func @fusedConv2dRelu(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x32x32x16xf32> %1 = "tfl.relu"(%0) : (tensor<256x32x32x16xf32>) -> tensor<256x32x32x16xf32> func.return %1 : tensor<256x32x32x16xf32> - + // CHECK: %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x32x32x16xf32> // CHECK: return %0 } @@ -60,6 +60,25 @@ func.func @fuseAddIntoConv2d(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x // CHECK: %0 = "tfl.conv_2d"(%arg0, %arg1, %cst) } +// CHECK-LABEL: fuse4DAddIntoConv2d +func.func @fuse4DAddIntoConv2d(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<2x3x3x3xf32>) -> tensor<256x32x32x2xf32> { + %cst = arith.constant dense<[[[[1.0, 2.0]]]]> : tensor<1x1x1x2xf32> + %cst_0 = arith.constant dense<[1.0, 2.0]> : tensor<2xf32> + %0 = "tfl.conv_2d"(%arg0, %arg1, %cst_0) { + dilation_h_factor = 1 : i32, + dilation_w_factor = 1 : i32, + fused_activation_function = "NONE", + padding = "SAME", + stride_h = 1 : i32, + stride_w = 1 : i32 + } : (tensor<256x32x32x3xf32>, tensor<2x3x3x3xf32>, tensor<2xf32>) -> tensor<256x32x32x2xf32> + %1 = "tfl.add"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<256x32x32x2xf32>, tensor<1x1x1x2xf32>) -> tensor<256x32x32x2xf32> + func.return %1 : tensor<256x32x32x2xf32> + + // CHECK-DAG: %cst = arith.constant dense<[2.000000e+00, 4.000000e+00]> : tensor<2xf32> + // CHECK: %0 = "tfl.conv_2d"(%arg0, %arg1, %cst) +} + // CHECK-LABEL: fuseSubIntoConv2d func.func @fuseSubIntoConv2d(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf32>) -> tensor<256x32x32x16xf32> { %cst = arith.constant dense<0.5> : tensor<16xf32> @@ -217,12 +236,20 @@ func.func @fuseSubIntoDepthwiseConv2d(%arg0: tensor<256x32x32x3xf32>, %arg1: ten } // CHECK-LABEL: dontFuseSubIntoDepthwiseConv2d -func.func @dontFuseSubIntoDepthwiseConv2d(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> { - %cst = arith.constant dense<0.5> : tensor<1x16xf32> - %cst_0 = arith.constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> - %0 = "tfl.depthwise_conv_2d"(%arg0, %arg1, %cst_0) {depth_multiplier = 4 : i32, dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<256x32x32x3xf32>, 
tensor<3x3x3x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> - %1 = "tfl.sub"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<256x30x30x16xf32>, tensor<1x16xf32>) -> tensor<256x30x30x16xf32> - func.return %1 : tensor<256x30x30x16xf32> +func.func @dontFuseSubIntoDepthwiseConv2d(%arg0: tensor<256x3x3x3xf32>, %arg1: tensor<3x3x3x5xf32>) -> tensor<256x2x2x4xf32> { + %cst = arith.constant dense<[[1.0, 2.0, 3.0, 4.0], [-1.0, -2.0, -3.0, -4.0]]> : tensor<2x4xf32> + %cst_0 = arith.constant dense<[1.0, 2.0, 3.0, 4.0]> : tensor<4xf32> + %0 = "tfl.depthwise_conv_2d"(%arg0, %arg1, %cst_0) { + depth_multiplier = 4 : i32, + dilation_h_factor = 2 : i32, + dilation_w_factor = 3 : i32, + fused_activation_function = "NONE", + padding = "SAME", + stride_h = 4 : i32, + stride_w = 5 : i32 + } : (tensor<256x3x3x3xf32>, tensor<3x3x3x5xf32>, tensor<4xf32>) -> tensor<256x2x2x4xf32> + %1 = "tfl.sub"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<256x2x2x4xf32>, tensor<2x4xf32>) -> tensor<256x2x2x4xf32> + func.return %1 : tensor<256x2x2x4xf32> // CHECK: "tfl.depthwise_conv_2d" // CHECK: tfl.sub @@ -432,6 +459,23 @@ func.func @fuseMulIntoDepthwiseConv2d(%arg0: tensor<1x112x112x2xf32>) -> tensor< // CHECK: return %0 } +// CHECK-LABEL: @fuse4DMulIntoDepthwiseConv2d +func.func @fuse4DMulIntoDepthwiseConv2d(%arg0: tensor<1x112x112x2xf32>) -> tensor<1x112x112x2xf32> { + %cst0 = arith.constant dense<[[[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], [[7.0, 8.0], [9.0, 10.0], [11.0, 12.0]], [[13.0, 14.0], [15.0, 16.0], [17.0, 18.0]]]]> : tensor<1x3x3x2xf32> + %cst1 = arith.constant dense<2.0> : tensor<2xf32> + %cst2 = arith.constant dense<[[[[1.0, 2.0]]]]> : tensor<1x1x1x2xf32> + + %0 = "tfl.depthwise_conv_2d"(%arg0, %cst0, %cst1) {depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x112x2xf32>, tensor<1x3x3x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> + %1 = "tfl.mul"(%0, %cst2) {fused_activation_function = "RELU6"} : (tensor<1x112x112x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x112x112x2xf32> + + func.return %1 : tensor<1x112x112x2xf32> + +// CHECK-DAG: %cst = arith.constant dense<{{\[\[\[\[}}1.000000e+00, 4.000000e+00], [3.000000e+00, 8.000000e+00], [5.000000e+00, 1.200000e+01]], {{\[\[}}7.000000e+00, 1.600000e+01], [9.000000e+00, 2.000000e+01], [1.100000e+01, 2.400000e+01]], {{\[\[}}1.300000e+01, 2.800000e+01], [1.500000e+01, 3.200000e+01], [1.700000e+01, 3.600000e+01]]]]> : tensor<1x3x3x2xf32> +// CHECK-DAG: %cst_0 = arith.constant dense<[2.000000e+00, 4.000000e+00]> : tensor<2xf32> +// CHECK: %0 = "tfl.depthwise_conv_2d"(%arg0, %cst, %cst_0) {depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x112x2xf32>, tensor<1x3x3x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> +// CHECK: return %0 +} + // CHECK-LABEL: @notFuseMulIntoDepthwiseConv2d func.func @notFuseMulIntoDepthwiseConv2d(%arg0: tensor<1x4x4x2xf32>) -> tensor<1x4x4x2xf32> { %cst0 = arith.constant dense<[[[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], [[7.0, 8.0], [9.0, 10.0], [11.0, 12.0]], [[13.0, 14.0], [15.0, 16.0], [17.0, 18.0]]]]> : tensor<1x3x3x2xf32> @@ -464,6 +508,21 @@ func.func @FuseFullyConnectedAddWithNoBias(%arg0: tensor<40x37xf32>, %arg1: tens // CHECK: return %[[fc]] } +// CHECK-LABEL: @FuseFullyConnectedReducedAddWithNoBias +func.func 
@FuseFullyConnectedReducedAddWithNoBias(%arg0: tensor<1024x1x126xf32>, %arg1: tensor<128x126xf32>) -> tensor<1024x1x128xf32> { + %cst = "tfl.no_value"() {value} : () -> none + %cst2 = arith.constant dense<2.0> : tensor<1x1x128xf32> + + %0 = "tfl.fully_connected" (%arg0, %arg1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1024x1x126xf32>, tensor<128x126xf32>, none) -> (tensor<1024x1x128xf32>) + %1 = "tfl.add"(%0, %cst2) {fused_activation_function = "NONE"} : (tensor<1024x1x128xf32>, tensor<1x1x128xf32>) -> tensor<1024x1x128xf32> + + func.return %1 : tensor<1024x1x128xf32> + + // CHECK-DAG: %cst = arith.constant dense<2.000000e+00> : tensor<128xf32> + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %cst) + // CHECK: return %[[fc]] +} + // CHECK-LABEL: @FuseFullyConnectedAddWithExistingBias func.func @FuseFullyConnectedAddWithExistingBias(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { %cst = arith.constant dense<3.0> : tensor<40xf32> @@ -552,6 +611,38 @@ func.func @FuseFullyConnectedReshapeAddConst(%arg0: tensor<40x37xf32>, %arg1: te // FOLD: return %[[fc]] } +// CHECK-LABEL: @RemoveRedundantReshapeUsedAsInputToBinaryOp +func.func @RemoveRedundantReshapeUsedAsInputToBinaryOp(%arg0: tensor<128xf32>, %arg1: tensor<1x512x512x128xf32>, %arg2: tensor<1x512x512x128xf32>) -> (tensor<1x512x512x128xf32>, tensor<1x512x512x128xf32>) { + %cst_10 = arith.constant dense<[1, 1, 1, 128]> : tensor<4xi32> + + %894 = "tfl.reshape"(%arg0, %cst_10) : (tensor<128xf32>, tensor<4xi32>) -> tensor<1x1x1x128xf32> + %895 = "tfl.mul"(%894, %arg1) {fused_activation_function = "NONE"} : (tensor<1x1x1x128xf32>, tensor<1x512x512x128xf32>) -> tensor<1x512x512x128xf32> + %896 = "tfl.mul"(%arg2, %894) {fused_activation_function = "NONE"} : (tensor<1x512x512x128xf32>, tensor<1x1x1x128xf32>) -> tensor<1x512x512x128xf32> + + return %895, %896 : tensor<1x512x512x128xf32>, tensor<1x512x512x128xf32> + + // CHECK: %0 = tfl.mul(%arg0, %arg1) + // CHECK: %1 = tfl.mul(%arg2, %arg0) + // CHECK: return %0, %1 +} + +// CHECK-LABEL: @RetainRedundantReshapeUseInNonBinaryOp +func.func @RetainRedundantReshapeUseInNonBinaryOp(%arg0: tensor<128xf32>, %arg1: tensor<1x512x512x128xf32>, %arg2: tensor<1x512x512x128xf32>) -> (tensor<1x512x512x128xf32>, tensor<128xf32>) { + %cst = arith.constant dense<0> : tensor<1xi32> + %cst_10 = arith.constant dense<[1, 1, 1, 128]> : tensor<4xi32> + %894 = "tfl.reshape"(%arg0, %cst_10) : (tensor<128xf32>, tensor<4xi32>) -> tensor<1x1x1x128xf32> + %895 = "tfl.mul"(%894, %arg1) {fused_activation_function = "NONE"} : (tensor<1x1x1x128xf32>, tensor<1x512x512x128xf32>) -> tensor<1x512x512x128xf32> + %896 = "tfl.reduce_max"(%894, %cst) {keep_dims = false} : (tensor<1x1x1x128xf32>, tensor<1xi32>) -> tensor<128xf32> + return %895, %896 : tensor<1x512x512x128xf32>, tensor<128xf32> + + // CHECK-DAG: %cst = arith.constant dense<0> : tensor<1xi32> + // CHECK-DAG: %cst_0 = arith.constant dense<[1, 1, 1, 128]> : tensor<4xi32> + // CHECK: %0 = "tfl.reshape"(%arg0, %cst_0) : (tensor<128xf32>, tensor<4xi32>) -> tensor<1x1x1x128xf32> + // CHECK: %1 = tfl.mul(%0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x1x1x128xf32>, tensor<1x512x512x128xf32>) -> tensor<1x512x512x128xf32> + // CHECK: %2 = "tfl.reduce_max"(%0, %cst) {keep_dims = false} : (tensor<1x1x1x128xf32>, tensor<1xi32>) -> tensor<128xf32> + // CHECK: return %1, %2 +} + // CHECK-LABEL: @FuseFullyConnectedReshapeAddConstWithOptionalAttribute // FOLD-LABEL: 
@FuseFullyConnectedReshapeAddConstWithOptionalAttribute func.func @FuseFullyConnectedReshapeAddConstWithOptionalAttribute(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { @@ -618,6 +709,24 @@ func.func @FuseFullyConnectedReshapeAdd2DConst(%arg0: tensor<40x37xf32>, %arg1: // CHECK: return %[[rs]] } +// CHECK-LABEL: @FuseFCReshapeAdd2DConst2 +func.func @FuseFCReshapeAdd2DConst2(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<1x40x4x10xf32> { + %cst = "tfl.no_value"() {value} : () -> none + %cst2 = arith.constant dense<2.0> : tensor<1x1x4x10xf32> + %shape = arith.constant dense<[1, 40, 4, 10]> : tensor<4xi32> + + %0 = "tfl.fully_connected"(%arg0, %arg1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> (tensor<40x40xf32>) + %1 = "tfl.reshape"(%0, %shape) : (tensor<40x40xf32>, tensor<4xi32>) -> tensor<1x40x4x10xf32> + %2 = "tfl.add"(%1, %cst2) {fused_activation_function = "NONE"} : (tensor<1x40x4x10xf32>, tensor<1x1x4x10xf32>) -> tensor<1x40x4x10xf32> + + func.return %2 : tensor<1x40x4x10xf32> + + // CHECK-DAG: %[[cst:.*]] = arith.constant dense<2.000000e+00> : tensor<40xf32> + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} + // CHECK: %[[rs:.*]] = "tfl.reshape"(%[[fc]] + // CHECK: return %[[rs]] +} + // CHECK-LABEL: @FuseFullyConnectedReshapeAdd2DConstWithActivation func.func @FuseFullyConnectedReshapeAdd2DConstWithActivation(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<1x40x4x10xf32> { %cst = "tfl.no_value"() {value} : () -> none @@ -636,6 +745,24 @@ func.func @FuseFullyConnectedReshapeAdd2DConstWithActivation(%arg0: tensor<40x37 // CHECK: return %[[rs]] } +// CHECK-LABEL: @FuseFCReshapeAdd2DConstWithActvtn2 +func.func @FuseFCReshapeAdd2DConstWithActvtn2(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<1x40x4x10xf32> { + %cst = "tfl.no_value"() {value} : () -> none + %cst2 = arith.constant dense<2.0> : tensor<1x1x4x10xf32> + %shape = arith.constant dense<[1, 40, 4, 10]> : tensor<4xi32> + + %0 = "tfl.fully_connected"(%arg0, %arg1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> (tensor<40x40xf32>) + %1 = "tfl.reshape"(%0, %shape) : (tensor<40x40xf32>, tensor<4xi32>) -> tensor<1x40x4x10xf32> + %2 = "tfl.add"(%1, %cst2) {fused_activation_function = "RELU6"} : (tensor<1x40x4x10xf32>, tensor<1x1x4x10xf32>) -> tensor<1x40x4x10xf32> + + func.return %2 : tensor<1x40x4x10xf32> + + // CHECK-DAG: %[[cst:.*]] = arith.constant dense<2.000000e+00> : tensor<40xf32> + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} + // CHECK: %[[rs:.*]] = "tfl.reshape"(%[[fc]] + // CHECK: return %[[rs]] +} + // CHECK-LABEL: @FuseFullyConnectedReshapeAdd2DConstWithExistingBias func.func @FuseFullyConnectedReshapeAdd2DConstWithExistingBias(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<1x40x4x10xf32> { %cst = arith.constant dense<3.0> : tensor<40xf32> @@ -775,6 +902,17 @@ func.func @ReorderElementwiseValueOpAndMoveOp(%arg0: tensor<40x40x1xf32>) -> ten // CHECK: return %[[rs2]] } +// CHECK-LABEL: @MinimumOfReluAnd6ToRelu6 +func.func @MinimumOfReluAnd6ToRelu6(%arg0: tensor<40x40xf32>) -> tensor<40x40xf32> { + %cst = arith.constant dense<6.0> 
: tensor + %2 = "tfl.relu"(%arg0) : (tensor<40x40xf32>) -> tensor<40x40xf32> + %3 = "tfl.minimum"(%2, %cst) : (tensor<40x40xf32>, tensor) -> tensor<40x40xf32> + func.return %3 : tensor<40x40xf32> + + // CHECK: %[[rs1:.*]] = "tfl.relu6"(%arg0 + // CHECK: return %[[rs1]] +} + // CHECK-LABEL: @NotReorderElementwiseValueOpAndMoveOp func.func @NotReorderElementwiseValueOpAndMoveOp(%arg0: tensor<40x40x1xf32>) -> (tensor<40x40xf32>, tensor<40x40xf32>) { %shape = arith.constant dense<[40, 40]> : tensor<2xi32> @@ -1809,6 +1947,32 @@ func.func @DontConvertConstSelectMixed(%arg0: tensor<2xf32>, %arg1: tensor<2xf32 // CHECK: return %0, %1 } +// CHECK-LABEL: FuseBroadcastToIntoSelect +func.func @FuseBroadcastToIntoSelect(%arg0: tensor<1x8x1024x2048xf32>, %arg1: tensor<1x8x1024x2048xf32>, %arg2: tensor<1x1x1x2048xi1>) -> (tensor<1x8x1024x2048xf32>, tensor<1x8x1024x2048xf32>) { + %cst_0 = arith.constant dense<[1, 8, 1024, 2048]> : tensor<4xi32> + %0 = "tfl.broadcast_to"(%arg2, %cst_0) : (tensor<1x1x1x2048xi1>, tensor<4xi32>) -> tensor<1x8x1024x2048xi1> + %1 = "tfl.select"(%0, %arg0, %arg1) : (tensor<1x8x1024x2048xi1>, tensor<1x8x1024x2048xf32>, tensor<1x8x1024x2048xf32>) -> tensor<1x8x1024x2048xf32> + %2 = "tfl.select_v2"(%0, %arg0, %arg1) : (tensor<1x8x1024x2048xi1>, tensor<1x8x1024x2048xf32>, tensor<1x8x1024x2048xf32>) -> tensor<1x8x1024x2048xf32> + func.return %1, %2 : tensor<1x8x1024x2048xf32>, tensor<1x8x1024x2048xf32> + // CHECK: %0 = "tfl.select_v2"(%arg2, %arg0, %arg1) : (tensor<1x1x1x2048xi1>, tensor<1x8x1024x2048xf32>, tensor<1x8x1024x2048xf32>) -> tensor<1x8x1024x2048xf32> + // CHECK: %1 = "tfl.select_v2"(%arg2, %arg0, %arg1) : (tensor<1x1x1x2048xi1>, tensor<1x8x1024x2048xf32>, tensor<1x8x1024x2048xf32>) -> tensor<1x8x1024x2048xf32> + // CHECK: return %0, %1 +} + +// CHECK-LABEL: FuseBroadcastToIntoSelect1 +func.func @FuseBroadcastToIntoSelect1(%arg0: tensor<1x1x8x1024x2048xf32>, %arg1: tensor<1x1x8x1024x2048xf32>, %arg2: tensor<1x1x1x1x2048xi1>) -> tensor<1x1x8x1024x2048xf32> { + %cst_0 = arith.constant dense<[1, 1, 8, 1024, 2048]> : tensor<5xi32> + %0 = "tfl.broadcast_to"(%arg2, %cst_0) : (tensor<1x1x1x1x2048xi1>, tensor<5xi32>) -> tensor<1x1x8x1024x2048xi1> + %1 = "tfl.select"(%0, %arg0, %arg1) : (tensor<1x1x8x1024x2048xi1>, tensor<1x1x8x1024x2048xf32>, tensor<1x1x8x1024x2048xf32>) -> tensor<1x1x8x1024x2048xf32> + + func.return %1 : tensor<1x1x8x1024x2048xf32> + // CHECK-DAG: %cst = arith.constant dense<[1, 1, 8, 1024, 2048]> : tensor<5xi32> + // CHECK: %0 = "tfl.broadcast_to"(%arg2, %cst) : (tensor<1x1x1x1x2048xi1>, tensor<5xi32>) -> tensor<1x1x8x1024x2048xi1> + // CHECK: %1 = "tfl.select"(%0, %arg0, %arg1) : (tensor<1x1x8x1024x2048xi1>, tensor<1x1x8x1024x2048xf32>, tensor<1x1x8x1024x2048xf32>) -> tensor<1x1x8x1024x2048xf32> + + // CHECK: return %1 +} + // CHECK-LABEL: CheckSelectNegated func.func @CheckSelectNegated(%arg0: tensor<1x2x3x4xi1>, %arg1: tensor<1x2x3x4xf32>, %arg2: tensor<1x2x3x4xf32>) -> (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) { %not = "tfl.logical_not"(%arg0) : (tensor<1x2x3x4xi1>) -> tensor<1x2x3x4xi1> @@ -2381,51 +2545,42 @@ func.func @fuseUnpackAndConcatToReshape(%arg0: tensor<1x3x2xf32>) -> tensor<1x6x // CHECK: return %[[RES]] } -// CHECK-LABEL: replaceReshapeEqualWithOneHot -func.func @replaceReshapeEqualWithOneHot(%arg: tensor<2xi32>) -> tensor<2x3xi1> { - // Good match: Replace with one_hot - %shape = arith.constant dense<[2, 1]> : tensor<2xi32> +// CHECK-LABEL: replaceReshapeEqualWithOneHotSingleDim +func.func @replaceReshapeEqualWithOneHotSingleDim(%arg: 
tensor<1xi32>) -> tensor<3xi1> { %cst = arith.constant dense<[0, 1, 2]> : tensor<3xi32> - %tmp = "tfl.reshape"(%arg, %shape) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x1xi32> - %result = "tfl.equal"(%tmp, %cst) : (tensor<2x1xi32>, tensor<3xi32>) -> tensor<2x3xi1> + %result = "tfl.equal"(%arg, %cst) : (tensor<1xi32>, tensor<3xi32>) -> tensor<3xi1> + func.return %result : tensor<3xi1> + + // CHECK-NOT: tfl.one_hot +} + +// CHECK-LABEL: replaceReshapeEqualWithOneHot +func.func @replaceReshapeEqualWithOneHot(%arg: tensor<2x1xi32>) -> tensor<2x3xi1> { + // Good match: Replace with one_hot + %cst = arith.constant dense<[0, 1, 2]> : tensor<3xi32> + %result = "tfl.equal"(%arg, %cst) : (tensor<2x1xi32>, tensor<3xi32>) -> tensor<2x3xi1> func.return %result : tensor<2x3xi1> // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<3> : tensor // CHECK-DAG: %[[CST2:.*]] = arith.constant dense : tensor // CHECK-DAG: %[[CST3:.*]] = arith.constant dense : tensor - // CHECK: %[[RES:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xi1> + // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<2> : tensor<1xi32> + // CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST4]]) : (tensor<2x1xi32>, tensor<1xi32>) -> tensor<2xi32> + // CHECK: %[[RES:.*]] = "tfl.one_hot"(%[[RESHAPE]], %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xi1> } -// CHECK-LABEL: replaceReshapeEqualWithOneHotWithNonTrivialReshape -func.func @replaceReshapeEqualWithOneHotWithNonTrivialReshape(%arg: tensor<4x4xi32>) -> tensor<16x3xi1> { - // Good match: Replace with one_hot - %shape = arith.constant dense<[16, 1]> : tensor<2xi32> +// CHECK-LABEL: ReplaceReshapeEqualWithOneHotWithBatchingDim +func.func @ReplaceReshapeEqualWithOneHotWithBatchingDim(%arg: tensor<2x2x1xi32>) -> tensor<2x2x3xi1> { %cst = arith.constant dense<[0, 1, 2]> : tensor<3xi32> - %tmp = "tfl.reshape"(%arg, %shape) : (tensor<4x4xi32>, tensor<2xi32>) -> tensor<16x1xi32> - %result = "tfl.equal"(%tmp, %cst) : (tensor<16x1xi32>, tensor<3xi32>) -> tensor<16x3xi1> - func.return %result : tensor<16x3xi1> + %result = "tfl.equal"(%arg, %cst) : (tensor<2x2x1xi32>, tensor<3xi32>) -> tensor<2x2x3xi1> + func.return %result : tensor<2x2x3xi1> // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<3> : tensor // CHECK-DAG: %[[CST2:.*]] = arith.constant dense : tensor // CHECK-DAG: %[[CST3:.*]] = arith.constant dense : tensor - // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<16> : tensor<1xi32> - // CHECK-DAG: %[[TMP:.*]] = "tfl.reshape"(%arg0, %[[CST4]]) : (tensor<4x4xi32>, tensor<1xi32>) -> tensor<16xi32> - // CHECK: %[[RES:.*]] = "tfl.one_hot"(%[[TMP]], %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<16xi32>, tensor, tensor, tensor) -> tensor<16x3xi1> -} - -// CHECK-LABEL: noReplaceReshapeEqualWithOneHotWithBatchingDim -func.func @noReplaceReshapeEqualWithOneHotWithBatchingDim(%arg: tensor<2xi32>) -> tensor<1x2x3xi1> { - // Do not replace: shape length longer than 2 - %shape = arith.constant dense<[1, 2, 1]> : tensor<3xi32> - %cst = arith.constant dense<[0, 1, 2]> : tensor<3xi32> - %tmp = "tfl.reshape"(%arg, %shape) : (tensor<2xi32>, tensor<3xi32>) -> tensor<1x2x1xi32> - %result = "tfl.equal"(%tmp, %cst) : (tensor<1x2x1xi32>, tensor<3xi32>) -> tensor<1x2x3xi1> - func.return %result : tensor<1x2x3xi1> - - // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<[1, 2, 1]> : tensor<3xi32> - // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<[0, 1, 2]> : 
tensor<3xi32> - // CHECK: %[[TMP:.*]] = "tfl.reshape"(%arg0, %[[CST1]]) : (tensor<2xi32>, tensor<3xi32>) -> tensor<1x2x1xi32> - // CHECK: %[[RES:.*]] = "tfl.equal"(%[[TMP]], %[[CST2]]) : (tensor<1x2x1xi32>, tensor<3xi32>) -> tensor<1x2x3xi1> + // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<2> : tensor<2xi32> + // CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST4]]) : (tensor<2x2x1xi32>, tensor<2xi32>) -> tensor<2x2xi32> + // CHECK: %[[RES:.*]] = "tfl.one_hot"(%[[RESHAPE]], %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2x2xi32>, tensor, tensor, tensor) -> tensor<2x2x3xi1> } // CHECK-LABEL: noReplaceReshapeEqualWithOneHotBadShape @@ -2549,8 +2704,8 @@ func.func @dontReplaceOneHotFullyConnectedWithLookupBadIndexTypeWithOptionalAttr // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) {asymmetric_quantize_inputs = true, } -// CHECK-LABEL: dontReplaceOneHotFullyConnectedWithLookupBadIndexRank -func.func @dontReplaceOneHotFullyConnectedWithLookupBadIndexRank(%arg: tensor<11x2xi32>) -> tensor<11x2x5xf32> { +// CHECK-LABEL: ReplaceOneHotFullyConnectedWithLookup2DRank +func.func @ReplaceOneHotFullyConnectedWithLookup2DRank(%arg: tensor<11x2xi32>) -> tensor<11x2x5xf32> { %depth = arith.constant dense<3> : tensor %on = arith.constant dense<1.0> : tensor %off = arith.constant dense<0.0> : tensor @@ -2558,18 +2713,16 @@ func.func @dontReplaceOneHotFullyConnectedWithLookupBadIndexRank(%arg: tensor<11 %bias = "tfl.no_value"() {value} : () -> none %tmp = "tfl.one_hot"(%arg, %depth, %on, %off) {axis = -1 : i32} : (tensor<11x2xi32>, tensor, tensor, tensor) -> tensor<11x2x3xf32> - %result = "tfl.fully_connected"(%tmp, %filter, %bias) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<11x2x3xf32>, tensor<5x3xf32>, none) -> tensor<11x2x5xf32> + %result = "tfl.fully_connected"(%tmp, %filter, %bias) {fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<11x2x3xf32>, tensor<5x3xf32>, none) -> tensor<11x2x5xf32> func.return %result : tensor<11x2x5xf32> - // CHECK-NOT: "tfl.embedding_lookup" - // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<3> : tensor - // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<1.000000e+00> : tensor - // CHECK-DAG: %[[CST3:.*]] = arith.constant dense<0.000000e+00> : tensor - // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<7.000000e+00> : tensor<5x3xf32> - // CHECK-DAG: %[[CST5:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<11x2xi32>, tensor, tensor, tensor) -> tensor<11x2x3xf32> - // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<11x2x3xf32>, tensor<5x3xf32>, none) -> tensor<11x2x5xf32> + // CHECK-DAG: %[[CST0:.*]] = arith.constant dense<22> : tensor<1xi32> + // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<7.000000e+00> : tensor<3x5xf32> + // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<[11, 2, 5]> : tensor<3xi32> + // CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST0]]) : (tensor<11x2xi32>, tensor<1xi32>) -> tensor<22xi32> + // CHECK: %[[TMP:.*]] = "tfl.embedding_lookup"(%[[RESHAPE]], %[[CST1]]) : (tensor<22xi32>, tensor<3x5xf32>) -> tensor<22x5xf32> + // CHECK: %[[RES:.*]] = "tfl.reshape"(%[[TMP]], %[[CST2]]) : (tensor<22x5xf32>, tensor<3xi32>) -> tensor<11x2x5xf32> // CHECK: return %[[RES]] : tensor<11x2x5xf32> } 
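The one_hot → embedding_lookup rewrite exercised by the test above relies on the fact that a fully_connected applied to a one-hot vector simply selects one column of the filter, i.e. one row of the transposed filter. Below is a minimal standalone C++ sketch (plain loops and an illustrative [units x depth] filter layout, not the actual TFLite kernels) that checks this equivalence numerically:

#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  constexpr int kDepth = 3;  // one_hot depth
  constexpr int kUnits = 5;  // fully_connected output units
  // Filter laid out as [units x depth], mirroring the tfl.fully_connected weights.
  std::vector<float> filter(kUnits * kDepth);
  for (int u = 0; u < kUnits; ++u)
    for (int d = 0; d < kDepth; ++d) filter[u * kDepth + d] = u * 10 + d;

  const std::vector<int32_t> indices = {2, 0, 1, 2};
  for (int32_t index : indices) {
    // Path 1: one_hot(index, kDepth) followed by fully_connected.
    std::array<float, kDepth> one_hot{};
    one_hot[index] = 1.0f;
    std::vector<float> fc_out(kUnits, 0.0f);
    for (int u = 0; u < kUnits; ++u)
      for (int d = 0; d < kDepth; ++d)
        fc_out[u] += one_hot[d] * filter[u * kDepth + d];

    // Path 2: embedding lookup of row `index` in transpose(filter), shape [depth x units].
    for (int u = 0; u < kUnits; ++u) assert(fc_out[u] == filter[u * kDepth + index]);
  }
  return 0;
}

The reshape ops in the expected IR only flatten the batch dimensions before the lookup and restore them afterwards; the per-index selection itself is what the loop above demonstrates.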
diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir index 17c793cd19f..d2e04734e0e 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir @@ -222,8 +222,8 @@ func.func @QuantizeFixedOutputRangeInterfaceOpSoftmax(%arg0: tensor<1x1xf32>) -> // CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> // CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> // CHECK-NEXT: %[[sm:.*]] = "tfl.softmax"(%[[dq1]]) {{{.*}}} : (tensor<1x1xf32>) -> tensor<1x1xf32> -// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[sm]]) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> +// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[sm]]) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> +// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> } // CHECK-LABEL: QuantizeFixedOutputRangeInterfaceOpL2Normalization diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 9836ec1ba15..a668475a9e2 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -193,6 +193,18 @@ func.func @identity(%arg0: tensor<10xi32>, %arg1: tensor<20xi32>, %arg2: tensor< // CHECK: return %arg0, %arg1, %arg2, %0 } +func.func @sharding(%arg0: tensor<10x10xi32>) -> (tensor<10x10xi32>) { + %0 = "tf.MatMul"(%arg0, %arg0) {device = "", transpose_a = false, transpose_b = false} : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<10x10xi32> + %1 = "tf.MatMul"(%arg0, %arg0) {device = "", transpose_a = false, transpose_b = false} : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<10x10xi32> + %2 = "tf.XlaSharding"(%0) {_XlaSharding = "\08\03\1A\02\01\01\22\01\00", device = "", sharding = "\08\03\1A\02\01\01\22\01\00", unspecified_dims = []} : (tensor<10x10xi32>) -> tensor<10x10xi32> + %3 = "tf.XlaSharding"(%1) {_XlaSharding = "\08\03\1A\02\01\01\22\01\00", device = "", sharding = "\08\03\1A\02\01\01\22\01\00", unspecified_dims = []} : (tensor<10x10xi32>) -> tensor<10x10xi32> + %4 = "tf.AddV2"(%2, %3) {device = ""} : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<10x10xi32> + func.return %4 : tensor<10x10xi32> + +// CHECK-LABEL: sharding +// CHECK-NOT: %2 = "tf.XlaSharding"(%0) {_XlaSharding = "\08\03\1A\02\01\01\22\01\00", device = "", sharding = "\08\03\1A\02\01\01\22\01\00", unspecified_dims = []} : (tensor<10x10xi32>) -> tensor<10x10xi32> +// CHECK-NOT: %3 = "tf.XlaSharding"(%1) {_XlaSharding = "\08\03\1A\02\01\01\22\01\00", device = "", sharding = "\08\03\1A\02\01\01\22\01\00", unspecified_dims = []} : (tensor<10x10xi32>) -> tensor<10x10xi32> +} func.func @matmulNoTransposeAOrB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1000xf32>) -> tensor<1x1000xf32> { %166 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", _output_shapes = ["tfshape$dim { size = 1} dim { size = 1000}"], device = "", name = "matmul", transpose_a = false, transpose_b = false} : (tensor<1x1280xf32>, tensor<1280x1000xf32>) -> 
tensor<1x1000xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/shape-inference.mlir b/tensorflow/compiler/mlir/lite/tests/shape-inference.mlir index 34b9a54bc91..5baa9811229 100644 --- a/tensorflow/compiler/mlir/lite/tests/shape-inference.mlir +++ b/tensorflow/compiler/mlir/lite/tests/shape-inference.mlir @@ -68,6 +68,7 @@ func.func @testConv2dShapeInferenceDynamic(%arg0: tensor<1x?x?x128xf32>, %arg1: module attributes {tf.versions = {producer = 888 : i32}} { func.func @testConv2dShapeInvalidRanks(%arg0: tensor<1x112x80xf32>, %arg1: tensor<128x3x3x128xf32>, %arg2: tensor<128xf32>) -> tensor<1x?x?x128xf32> { + // expected-error @+2 {{'tfl.conv_2d' op failed to infer returned types}} // expected-error @+1 {{Invalid ranks}} %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x80xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x?x?x128xf32> func.return %0 : tensor<1x?x?x128xf32> diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index e7a613b3184..86dbe9c513e 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -24,6 +24,7 @@ limitations under the License. #include #include "absl/types/span.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -44,10 +45,12 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" @@ -56,6 +59,8 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/lite/python/metrics/converter_error_data.pb.h" #include "tensorflow/lite/tools/optimize/quantize_weights.h" #include "tensorflow/lite/tools/optimize/reduced_precision_support.h" #include "tensorflow/tsl/platform/statusor.h" @@ -93,6 +98,25 @@ mlir::LogicalResult IsValidGraph(mlir::ModuleOp module) { return mlir::success(); } +mlir::LogicalResult GraphContainsStatefulPartitionedOp(mlir::ModuleOp module) { + auto result = module.walk([&](Operation* op) { + return llvm::isa_and_nonnull(op) + ? mlir::WalkResult::interrupt() + : mlir::WalkResult::advance(); + }); + if (result.wasInterrupted()) { + // StatefulPartitionedCall ops are not supported by the tflite runtime. 
+ mlir::TFL::AttachErrorCode( + module.emitError( + "The Graph contains unsupported `StatefulPartionedCallOp`(s), will " + "retry with `guarantee_all_funcs_used_once`"), + tflite::metrics::ConverterErrorData:: + ERROR_STATEFUL_PARTITIONED_CALL_IN_FINAL_IR); + return mlir::failure(); + } + return mlir::success(); +} + // Util that registers 'extra_tf_opdefs' to the TF global registry. // Return OK on success, failure if registering failed. Status RegisterExtraTfOpDefs(absl::Span extra_tf_opdefs) { @@ -143,17 +167,19 @@ StatusOr> LoadFromGraphdefOrMlirSource( if (use_splatted_constant) { return tensorflow::GraphdefToSplattedMlirTranslateFunction( - file->getBuffer(), debug_info_file, input_arrays, input_dtypes, - input_shapes, output_arrays, control_output_arrays, - specs.prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, + file->getBuffer(), debug_info_file, /*xla_compile_device_type=*/"", + input_arrays, input_dtypes, input_shapes, output_arrays, + control_output_arrays, specs.prune_unused_nodes, + /*convert_legacy_fed_inputs=*/true, /*graph_as_function=*/false, specs.upgrade_legacy, /*enable_shape_inference=*/false, /*unconditionally_use_set_output_shapes=*/true, context); } return tensorflow::GraphdefToMlirTranslateFunction( - file->getBuffer(), debug_info_file, input_arrays, input_dtypes, - input_shapes, output_arrays, control_output_arrays, - specs.prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, + file->getBuffer(), debug_info_file, /*xla_compile_device_type=*/"", + input_arrays, input_dtypes, input_shapes, output_arrays, + control_output_arrays, specs.prune_unused_nodes, + /*convert_legacy_fed_inputs=*/true, /*graph_as_function=*/false, specs.upgrade_legacy, /*enable_shape_inference=*/false, /*unconditionally_use_set_output_shapes=*/true, context); @@ -212,20 +238,25 @@ Status ConvertTFExecutorToStablehloFlatbuffer( return errors::Aborted("Failed to preprocess & freeze TF graph"); } - // The default minimum number of elements a weights array must have to be - // quantized by this transformation. - const int kWeightsMinNumElementsDefault = 1024; + // TODO(b/264218457): Refactor the component below once StableHLO Quantizer + // can run DRQ. Temporarily using TF Quantization for StableHLO DRQ. + if (!toco_flags.has_quantization_options()) { + // The default minimum number of elements a weights array must have to be + // quantized by this transformation. 
+ const int kWeightsMinNumElementsDefault = 1024; - tensorflow::quantization::QuantizationOptions quantization_options; + tensorflow::quantization::QuantizationOptions quantization_options; - quantization_options.mutable_quantization_method()->set_experimental_method( - tensorflow::quantization::QuantizationMethod::DYNAMIC_RANGE); - quantization_options.set_op_set( - tensorflow::quantization::UNIFORM_QUANTIZED); - quantization_options.set_min_num_elements_for_weights( - kWeightsMinNumElementsDefault); - tensorflow::quantization::AddQuantizePtqDynamicRangePasses( - pass_manager, quantization_options); + quantization_options.mutable_quantization_method() + ->set_experimental_method( + tensorflow::quantization::QuantizationMethod::DYNAMIC_RANGE); + quantization_options.set_op_set( + tensorflow::quantization::UNIFORM_QUANTIZED); + quantization_options.set_min_num_elements_for_weights( + kWeightsMinNumElementsDefault); + tensorflow::quantization::AddQuantizePtqDynamicRangePasses( + pass_manager, quantization_options); + } if (failed(pass_manager.run(module))) { return statusHandler.ConsumeStatus(); } @@ -237,6 +268,10 @@ Status ConvertTFExecutorToStablehloFlatbuffer( // Print out a detailed report of non-converted stats. pass_manager.addPass(mlir::odml::createPrintOpStatsPass()); mlir::odml::AddStablehloOptimizationPasses(pass_manager); + if (toco_flags.has_quantization_options()) { + stablehlo::quantization::AddQuantizationPasses( + pass_manager, toco_flags.quantization_options()); + } if (failed(pass_manager.run(module))) { return statusHandler.ConsumeStatus(); } @@ -285,7 +320,10 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::PassManager pass_manager(module.getContext()); mlir::registerPassManagerCLOptions(); - mlir::applyPassManagerCLOptions(pass_manager); + if (mlir::failed(mlir::applyPassManagerCLOptions(pass_manager))) { + return tensorflow::FromAbslStatus( + absl::UnknownError("failed to apply MLIR pass manager CL options")); + } pass_manager.addInstrumentation( std::make_unique( pass_manager.getContext())); @@ -345,6 +383,10 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( return status; } + if (failed(GraphContainsStatefulPartitionedOp(module))) { + return statusHandler.ConsumeStatus(); + } + if (export_to_mlir) { llvm::raw_string_ostream os(*result); module.print(os); diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc index 76a8de86dc3..bf4224c7631 100644 --- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc +++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc @@ -116,8 +116,7 @@ void DefaultQuantParamsPass::runOnOperation() { } func.walk([&](Operation *op) { - if (quant::IsOpNotQuantizable(op) || - op->getParentOfType()) { + if (!quant::IsOpQuantizable(op) || op->getParentOfType()) { return; } diff --git a/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h b/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h index 27c4763f179..51068fcf4ac 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h +++ b/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h @@ -368,7 +368,7 @@ LogicalResult ConvertTFDilatedConvOp::matchAndRewrite( "SpaceToBatchND op's padding doesn't have same shape/type with " "BatchToSpaceND op's crops"); } - int64_t m = stb_paddings_attr.getType().getDimSize(0); + int64_t m = stb_paddings_attr.getShapedType().getDimSize(0); // padding - crop. 
for (uint64_t i = 0; i < m; ++i) { for (uint64_t j = 0; j < 2; ++j) { diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index a8f0ea36135..a020a4be43a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -21,6 +21,7 @@ include "mlir/Dialect/Arith/IR/ArithOps.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "tensorflow/compiler/mlir/lite/utils/utils.td" def CreateEmptyBoolAttr : NativeCodeCall<"::mlir::BoolAttr()">; @@ -29,10 +30,10 @@ def DenseElementsAttr : ElementsAttrBase< "non-opaque constant tensor">; def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; + CPred<"$_self.cast().getShapedType().getElementType().isF32()">, "float constant tensor">; def Int64ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getType().getElementType().isInteger(64)">, "Int 64 constant tensor">; + CPred<"$_self.cast().getShapedType().getElementType().isInteger(64)">, "Int 64 constant tensor">; // Extract the ith int element from an ArrayAttr $0 as an 32-bit IntegerAttr // with builder. @@ -65,11 +66,6 @@ def ExtractSingleElementAsInt32 : NativeCodeCall< def CreateTFCastToInt32Op : NativeCodeCall< "CreateCastToInt32($0, $_loc, $_builder)">; -// Checks whether the given operation has static shapes and same shapes of all inputs. -def HasSameStaticShapesPred : CPred<"HasSameStaticShapes($0.getDefiningOp())">; -def HasSameStaticShapes : Constraint; -def HasNotSameStaticShapes : Constraint, "op must have not static same input shapes">; - def CreateNoneValue : NativeCodeCall< "$_builder.create($0.getLoc(), $_builder.getUnitAttr())">; @@ -234,11 +230,11 @@ def LegalizeSelect : Pat<(TF_SelectOp $cond, $x, $y), (TFL_SelectOp $cond, $x, $y)>; def LegalizeSelectV2SameStaticShape : Pat<(TF_SelectV2Op:$src_op $cond, $x, $y), (TFL_SelectOp $cond, $x, $y), - [(HasSameStaticShapes $src_op)]>; + [(OpHasSameStaticShapes $src_op)]>; def LegalizeSelectV2NotSameStaticShape : Pat< (TF_SelectV2Op:$src_op $cond, $x, $y), (TFL_SelectV2Op $cond, $x, $y), - [(HasNotSameStaticShapes $src_op)]>; + [(OpHasNotSameStaticShapes $src_op)]>; def LegalizeShape : Pat<(TF_ShapeOp $arg), (TFL_ShapeOp $arg)>; def LegalizeSigmoid : Pat<(TF_SigmoidOp $arg), (TFL_LogisticOp $arg)>; def LegalizeSin : Pat<(TF_SinOp F32Tensor:$arg), (TFL_SinOp $arg)>; @@ -577,6 +573,14 @@ def LegalizeAtan2 : Pat<(TF_Atan2Op $y, $x), (TFL_Atan2Op $y, $x)>; def LegalizeSign : Pat<(TF_SignOp $x), (TFL_SignOp $x)>; +def LegalizeBitcast : Pat<(TF_BitcastOp $x), (TFL_BitcastOp $x)>; + +def LegalizeBitwiseXor : Pat<(TF_BitwiseXorOp $l, $r), + (TFL_BitwiseXorOp $l, $r)>; + +def LegalizeRightShift : Pat<(TF_RightShiftOp $l, $r), + (TFL_RightShiftOp $l, $r)>; + // ============================================================================= // Training OPs // ============================================================================= diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 555a879c956..2a80ef14cc4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -92,28 +92,6 @@ class LegalizeTFPass : public impl::LegalizeTFPassBase { void 
runOnOperation() override; }; -// Returns true if all tensor value in `values` has static shape and same shape. -bool HasSameStaticShapes(Operation* op) { - auto values = op->getOperands(); - int index = 0; - ArrayRef shape; - for (Value value : values) { - auto shaped_type = value.getType().dyn_cast(); - if (!shaped_type || !shaped_type.hasStaticShape()) { - return false; - } - if (index == 0) { - shape = shaped_type.getShape(); - } else { - if (shape != shaped_type.getShape()) { - return false; - } - } - ++index; - } - return true; -} - // Util that casts 'val' to Int32 by adding a cast Op. Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) { IntegerType new_ele_type = rewriter.getIntegerType(32); diff --git a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc index d939d74c5dd..91abd715cdf 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc @@ -230,7 +230,7 @@ class LiftFlexCustomOp : public OpRewritePattern { StatusOr mlir_attr = tensorflow::ConvertAttributeValue(attr_value, &builder); if (!mlir_attr.ok()) { - return emitError(loc, mlir_attr.status().error_message()); + return emitError(loc, mlir_attr.status().message()); } attributes.push_back(builder.getNamedAttr(attr_name, *mlir_attr)); } diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 5be3dcac0e0..ae09b8faded 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -632,7 +632,7 @@ struct ConvertTensorListInitOp : public TensorListOpConverterBase { // as specified by element_dtype. RankedTensorType zero_type = tensorflow::GetTypeFromTFTensorShape({}, element_dtype); - Attribute zero_attr = rewriter.getZeroAttr(zero_type); + auto zero_attr = rewriter.getZeroAttr(zero_type); auto zero = rewriter.create(loc, zero_type, zero_attr); rewriter.replaceOpWithNewOp(op, result_type, list_shape, zero); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index a7e404233ce..9b74c6bf606 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -17,14 +17,17 @@ limitations under the License. // optimizes them to resulting operations in TensorFlowLite dialect. #include +#include #include #include #include #include #include +#include #include #include #include +#include #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -40,6 +43,7 @@ limitations under the License. 
#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project @@ -70,6 +74,14 @@ constexpr char kRelu[] = "RELU"; constexpr char kRelu6[] = "RELU6"; constexpr char kRelu1[] = "RELU_N1_TO_1"; +ElementsAttr FlattenTo1D(Attribute a) { + auto elements = a.cast(); + const std::array flattened_shape = {elements.getNumElements()}; + auto new_type = RankedTensorType::get(flattened_shape, + elements.getType().getElementType()); + return elements.reshape(new_type); +} + bool L2NormalizeReduceAxis(Value sq_op, DenseElementsAttr axis) { if (axis.getNumElements() == 0) { return false; @@ -138,14 +150,49 @@ bool IsTailOfShape(Type type1, Type type2) { return std::equal(i1, e1, i2); } +// This function removes explicit broadcasting on type1 and returns whether if +// the reduced `type1` dimensions are the same as the ending dimensions +// of `type2`. +bool IsReducedTailOfShape(Type type1, Type type2) { + auto tail_type = type1.dyn_cast(); + auto full_type = type2.dyn_cast(); + if (!tail_type || !full_type || !tail_type.hasRank() || !full_type.hasRank()) + return false; + + auto i1 = tail_type.getShape().rbegin(); + auto reduced_e1 = tail_type.getShape().rend(); + auto i2 = full_type.getShape().rbegin(); + + while ((std::distance(i1, reduced_e1) > 0) && (*(reduced_e1 - 1) == 1)) { + reduced_e1--; + } + + return (std::distance(i1, reduced_e1) > 0) && + (std::distance(i1, reduced_e1) <= full_type.getRank()) && + (std::equal(i1, reduced_e1, i2)); +} + +// Check if the value of the last dimension of type1 is equal to the number of +// elements in type2. This is a required condition to flatten type2 to form a +// 1D array and allow the binaryOp handle the broadcasting implicitly. +bool IsLastDimEqualToNumElements(Type type1, Type type2) { + return (type1.cast().getRank() >= 1 && + type1.cast().getDimSize( + type1.cast().getRank() - 1) == + type2.cast().getNumElements()); +} + bool CanFuseConvOrDepthwiseConvShapes(const ArrayRef filter_shape, const ArrayRef elements_shape, bool is_depthwise) { - // Also, val tensor must be of rank 1 or 0 (scalar). - const auto elements_rank = elements_shape.size(); - if (elements_rank != 1 && elements_rank != 0) { - return false; + // Val tensor must be a scalar or of a shape [1, ... , 1, elements_depth]. + const int elements_rank = elements_shape.size(); + for (int i = 0; i < elements_rank - 1; ++i) { + if (elements_shape[i] != 1) { + return false; + } } + auto elements_depth = elements_shape.empty() ? 1 : elements_shape.back(); // If elements depth equals 1 (i.e., scalar or tensor with 1 element), then we // can let binary op to broadcast elements. @@ -313,16 +360,6 @@ DenseElementsAttr GetShape(Value output_val) { llvm::ArrayRef(shape)); } -static Type GetShapeStrippedType(TypeAttr type_attr) { - auto type = type_attr.getValue(); - auto shaped_type = type.dyn_cast(); - if (shaped_type) { - return shaped_type.getElementType(); - } else { - return type; - } -} - // Returns `true` if reducing `axes` in `input` with `keep_dims=true` results in // the specified `shape` and `false` otherwise. 
static bool ShapeMatchesReduceWithKeepAxes(Value input, @@ -396,65 +433,63 @@ static bool FloatValueEquals(const Attribute &attr, double value) { }); } +// Returns true if `value` is compile-time constant and its splat value equals +// to `raw_value`. +template +bool IsConstantValueOf(mlir::TypedAttr value, T raw_value) { + auto element_type = value.getType().cast().getElementType(); + + if (element_type.isa()) { + return FloatValueEquals(value, raw_value); + } else if (element_type.isa()) { + auto int_attr = value.dyn_cast_or_null(); + if (!int_attr) return false; + + if (int_attr.isSplat()) { + return int_attr.getSplatValue() == raw_value; + } + return llvm::all_of(int_attr.getValues(), + [raw_value](const APInt &f) { return f == raw_value; }); + } + + return false; +} + // Returns true if the value's element type is F32. bool IsF32Value(Value value) { return value.getType().cast().getElementType().isF32(); } -// Returns the number of elements in attr if it is a DenseElementsAttr, 1 -// otherwise, as an unranked int32 Attribute. -Attribute GetNumElementsOrOne(Attribute attr) { - const auto dense_attr = attr.dyn_cast_or_null(); - int32_t num_elements = dense_attr ? dense_attr.getNumElements() : 1; +// Returns the number of elements in attr if it is a static shape, 1 otherwise, +// as an unranked int32 Attribute. +TypedAttr GetNumElementsOrOne(Type type) { + auto shaped_type = type.cast(); + int32_t num_elements = + shaped_type.hasStaticShape() ? shaped_type.getNumElements() : 1; - OpBuilder builder(attr.getContext()); + OpBuilder builder(type.getContext()); return DenseIntElementsAttr::get( RankedTensorType::get({}, builder.getI32Type()), {llvm::APInt(32, num_elements, true)}); } -bool HasExactlyTwoElements(Attribute attr) { - const auto values = attr.dyn_cast_or_null(); - if (!values) return false; - return values.getNumElements() == 2; -} - -// Returns true if attr is a DenseIntElementsAttr with the last element equal 1. -bool IsLastElementEqualsOne(Attribute attr) { - const auto ints = attr.dyn_cast_or_null(); - if (!ints) return false; - if (ints.empty()) return false; - const auto last_element_index = ints.getNumElements() - 1; - const auto iterator = ints.value_begin(); - const int last_element = iterator[last_element_index]; - return last_element == 1; -} - // Reshapes value to a given shape. -Value ReshapeValueDroppingLastDim(OpBuilder &builder, Value value, - Attribute shape) { - // This function is always guarded with IsLastElementEqualsOne(), so we could - // cast safely here. - const auto old_shape = shape.cast(); - auto iterator = old_shape.value_begin(); - SmallVector new_shape; - SmallVector new_shape_i64; - for (int i = 0; i < old_shape.size() - 1; ++i) { - new_shape.push_back(*iterator); - new_shape_i64.push_back(*iterator); - ++iterator; +Value ReshapeValueDroppingLastDim(OpBuilder &builder, Value value) { + // This function is always guarded with HasTrivialShapeExceptSecondLastDim(), + // so we could cast safely here. 
+ auto type = value.getType().cast(); + SmallVector new_shape; + for (int64_t dim : type.getShape().drop_back()) { + new_shape.push_back(dim); } return builder.create( - value.getLoc(), - RankedTensorType::get( - new_shape_i64, value.getType().cast().getElementType()), - value, + value.getLoc(), value, builder.create( - value.getLoc(), DenseIntElementsAttr::get( - RankedTensorType::get({old_shape.size() - 1}, - builder.getI32Type()), - new_shape))); + value.getLoc(), + DenseIntElementsAttr::get( + RankedTensorType::get(type.getRank() - 1, builder.getI32Type()), + new_shape))); } // Returns true if val has a static shape and the last dimension equals 1. @@ -467,6 +502,27 @@ bool IsLastDimensionEqualOne(Value val) { return last_element == 1; } +// Returns true if the supplied value- +// 1) Has only one use or +// 2) Is only used by binary op like AddOp, SubOp, MulOp or DivOp. +bool HasOneUseOrUsedByOnlyBinaryOps(Value out_value) { + if (out_value.hasOneUse()) { + return true; + } + + for (auto &use : out_value.getUses()) { + mlir::Operation *owner = use.getOwner(); + if (!llvm::isa(owner) && + !llvm::isa(owner) && + !llvm::isa(owner) && + !llvm::isa(owner)) { + return false; + } + } + + return true; +} + // Returns true if attr is a DenseIntElementsAttr of int32 or int64 values or an // incrementing sequence from 0 to N-1. // @@ -481,7 +537,10 @@ bool IsOneHotIndexAttribute(Attribute attr) { if (index_elem_bits != 32 && index_elem_bits != 64) { return false; } - if (index_type.getRank() != 1) { + // Checks that the index has shape of [1, 1, 1, ..., 1, N]. + if (index_type.getRank() < 1 || + llvm::any_of(index_type.getShape().drop_back(), + [](int64_t dim) { return dim != 1; })) { return false; } const auto elems = dense_attr.value_begin(); @@ -493,6 +552,32 @@ bool IsOneHotIndexAttribute(Attribute attr) { return true; } +Value Get1DShapeValue(OpBuilder &builder, Value value) { + auto type = value.getType().cast(); + if (!type.hasStaticShape()) { + return nullptr; + } + auto output_type = RankedTensorType::get({1}, builder.getI32Type()); + const int num_elements = type.getNumElements(); + return builder.create( + value.getLoc(), output_type, + DenseIntElementsAttr::get(output_type, num_elements)); +} + +Type GetEmbeddingLookupShape(Value lookup, Value value) { + auto lookup_type = lookup.getType().cast(); + if (!lookup_type.hasStaticShape()) { + return nullptr; + } + auto value_type = value.getType().cast(); + if (!value_type.hasStaticShape() || value_type.getRank() != 2) { + return nullptr; + } + SmallVector new_shape = {lookup_type.getNumElements(), + value_type.getDimSize(0)}; + return value_type.clone(new_shape); +} + // Creates FullyConnected op from params and returns the output. mlir::Value GetFcOutput(OpBuilder *builder, ::mlir::Operation::result_range result, Value input, @@ -521,7 +606,7 @@ bool AllValuesAreZero(mlir::Value value) { // Converts an Attribute with a single value of float or integral type to an // Attribute holding a single value of float type. If attr has no elements, the // result is 0.0f. 
-Attribute ConvertSingleElementAttrToFloatAttr(Attribute attr) { +TypedAttr ConvertSingleElementAttrToFloatAttr(Attribute attr) { const auto dense_fp_attr = attr.dyn_cast_or_null(); if (dense_fp_attr) { // Already float => return @@ -578,8 +663,6 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { bool is_scalar_rhs = false; if (constant_val_type.getRank() == 0) { is_scalar_rhs = true; - } else if (constant_val_type.getRank() != 1) { - return failure(); } Value filter = fc_op.getFilter(); @@ -593,7 +676,18 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { // Rewrite if (is_none_bias) { - if (is_scalar_rhs) { + if (constant_val_type.getRank() == 1) { + // If there no pre-existing bias and the `constant_val` is 1D, simply + // use `constant_val` as bias. + bias = constant_val; + } else { + if (!is_scalar_rhs && + !(IsReducedTailOfShape(constant_val.getType(), filter.getType()) && + IsLastDimEqualToNumElements(filter.getType(), + constant_val.getType()))) { + return failure(); + } + // If the `constant_val` is scalar, we must the shape of filter // to properly broadcast the scalar to `{num_channels}` shape. @@ -606,29 +700,48 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { } int num_channels = filter_type.getShape()[0]; - // Create a zero tensor with shape {num_channels}, and the type need to - // be the same as constant_val. - // This is a way to gracefully handle scalar tensor. The Add will always - // be constant-folded away regardless if `constant_val` is a scalar or - // not. + // Create a zero tensor with shape {num_channels}, and the type need + // to be the same as constant_val. This is a way to gracefully handle + // scalar tensor. The Add will always be constant-folded away + // regardless if `constant_val` is a scalar or not. RankedTensorType type = RankedTensorType::get( {num_channels}, constant_val_type.getElementType()); auto attr = rewriter.getZeroAttr(type); bias = rewriter.create(add_op.getLoc(), type, attr); auto none_af = rewriter.getStringAttr("NONE"); - bias = - rewriter.create(add_op.getLoc(), bias, constant_val, none_af) - .getOutput(); - } else { - // If there no pre-existing bias and the `constant_val` is 1D, simply - // use `constant_val` as bias. 
- bias = constant_val; + if (is_scalar_rhs) { + bias = + rewriter + .create(add_op.getLoc(), bias, constant_val, none_af) + .getOutput(); + } else { + // If the RHS is neither a scalar constant nor a 1d constant, look + // if there is opportunity to reduce the dimensionality and allow + // implicit broadcasting + + auto new_added_value = added_value.reshape(RankedTensorType::get( + {added_value.getType().cast().getNumElements()}, + added_value.getType().cast().getElementType())); + + ::mlir::arith::ConstantOp new_constant_val = + rewriter.create<::mlir::arith::ConstantOp>( + add_op.getLoc(), + /*value=*/new_added_value); + + bias = rewriter + .create<::mlir::TFL::AddOp>( + add_op.getLoc(), + /*lhs=*/bias, + /*rhs=*/new_constant_val.getResult(), + /*fused_activation_function=*/none_af) + .getOutput(); + } } } else { - auto none_af = rewriter.getStringAttr("NONE"); - bias = - rewriter.create(add_op.getLoc(), bias, constant_val, none_af) - .getOutput(); + bias = rewriter + .create(add_op.getLoc(), bias, constant_val, + rewriter.getStringAttr("NONE")) + .getOutput(); } auto fc = rewriter.create( @@ -641,7 +754,8 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { rewriter.getStringAttr(add_op.getFusedActivationFunction()), /*weights_format=*/rewriter.getStringAttr(fc_op.getWeightsFormat()), /*keep_num_dims=*/rewriter.getBoolAttr(fc_op.getKeepNumDims()), - /*asymmetric_quantize_inputs=*/fc_op.getAsymmetricQuantizeInputsAttr()); + /*asymmetric_quantize_inputs=*/ + fc_op.getAsymmetricQuantizeInputsAttr()); rewriter.replaceOp(add_op, fc.getOutput()); return success(); @@ -1488,7 +1602,7 @@ struct FuseUnpackAndConcatToReshape if (!unpack_op || unpack_op.getNumResults() != concat_op.getNumOperands()) { return failure(); } - for (auto &index_and_value : llvm::enumerate(concat_op.getValues())) { + for (const auto &index_and_value : llvm::enumerate(concat_op.getValues())) { if (index_and_value.value() != unpack_op.getResult(index_and_value.index())) { return failure(); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index e772f9cb88d..216ac15c034 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -25,12 +25,12 @@ include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" // Checks if the param passed is a F32 ElementsAttr. def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.isa() && $_self.cast().getType().getElementType().isF32()">, + CPred<"$_self.isa() && $_self.cast().getShapedType().getElementType().isF32()">, "32 bit float constant tensor">; // Checks if the param passed is a float ElementsAttr. def FloatElementsAttr : ElementsAttrBase< - CPred<"$_self.isa() && $_self.cast().getType().getElementType().isa()">, + CPred<"$_self.isa() && $_self.cast().getShapedType().getElementType().isa()">, "float constant tensor">; // Checks if the param passed is of NoneType. @@ -44,11 +44,28 @@ class HasRankAtMost : Constraint< CPred<"$0.getType().cast().hasRank() && " "$0.getType().cast().getRank() <= " # n>>; +// Checks if the value has rank at least 'n'. +class HasRankAtLeast : Constraint< + CPred<"$0.getType().cast().hasRank() && " + "$0.getType().cast().getRank() >= " # n>>; + // Checks if the value has rank 'n'. class HasRank : Constraint< CPred<"$0.getType().cast().hasRank() && " "$0.getType().cast().getRank() == " # n>>; +// Flattens a constant tensor to 1D.
+def FlattenTo1D : NativeCodeCall<"FlattenTo1D($0)">; + +def HasOneUse : Constraint>; + +def HasSameStaticShapes : Constraint< + CPred<"$0.getType().cast().hasStaticShape() && " + "$1.getType().cast().hasStaticShape() && " + "$0.getType().cast().getShape() ==" + "$1.getType().cast().getShape()">, + "have the same static shape">; + //===----------------------------------------------------------------------===// // Ternary ops patterns. //===----------------------------------------------------------------------===// @@ -111,7 +128,7 @@ multiclass FuseBinaryOpToPrecedingAffine { (Arith_ConstantOp FloatElementsAttr:$value), $act_fn), (TFL_Conv2DOp $input, $filter, (binaryOp (Arith_ConstantOp $bias), - (Arith_ConstantOp $value), TFL_AF_None), + (Arith_ConstantOp (FlattenTo1D $value)), TFL_AF_None), $h_factor, $w_factor, $act_fn, $padding, $stride_h, $stride_w), [(CanFuseConvOrDepthwiseConv<"false"> $filter, $value), (HasOneUse $output)]>; @@ -122,11 +139,11 @@ multiclass FuseBinaryOpToPrecedingAffine { $stride_w, $multiplier), (Arith_ConstantOp FloatElementsAttr:$value), $act_fn), (TFL_DepthwiseConv2DOp $input, $filter, - (binaryOp (Arith_ConstantOp $bias), (Arith_ConstantOp $value), TFL_AF_None), + (binaryOp (Arith_ConstantOp $bias), + (Arith_ConstantOp (FlattenTo1D $value)), TFL_AF_None), $h_factor, $w_factor, $act_fn, $padding, $stride_h, $stride_w, $multiplier), [(CanFuseConvOrDepthwiseConv<"true"> $filter, $value), - (HasRank<1> $value), (HasOneUse $output)]>; def FuseBinaryOpWithTransposeConv#binaryOp : Pat< (binaryOp (TFL_TransposeConvOp:$output $output_shape, $weights, $input, @@ -181,12 +198,11 @@ multiclass FuseMulOrDivWithConv2dOrDepthwiseConv2d { TFL_AF_None), (BinaryOp (Arith_ConstantOp $bias), - (Arith_ConstantOp $value), + (Arith_ConstantOp (FlattenTo1D $value)), TFL_AF_None), $h_factor, $w_factor, $act_fn, $padding, $stride_h, $stride_w, $multiplier), [(CanFuseConvOrDepthwiseConv<"true"> $filter, $value), - (HasRank<1> $value), (HasOneUse $output)]>; def FuseMulOrDivWithConv#BinaryOp : Pat< (BinaryOp (TFL_Conv2DOp:$conv_output $input, @@ -200,7 +216,7 @@ multiclass FuseMulOrDivWithConv2dOrDepthwiseConv2d { (Arith_ConstantOp (ExpandTo4DForConv $value)), TFL_AF_None), (BinaryOp (Arith_ConstantOp $bias), - (Arith_ConstantOp $value), + (Arith_ConstantOp (FlattenTo1D $value)), TFL_AF_None), $h_factor, $w_factor, $act_fn, $padding, $stride_h, $stride_w), [(CanFuseConvOrDepthwiseConv<"false"> $filter, $value), @@ -395,19 +411,30 @@ def OperandsBroadcastToOutputType : Constraint>; +def IsReducedTailOfShape : Constraint>; + +def IsRankLessThanEqualTo : Constraint().getRank() <= " + "$1.getType().cast().getRank()">>; + def Flatten : NativeCodeCall< "$0.cast()" ".reshape(RankedTensorType::get({$0.getType().cast().getNumElements()}, " "$0.getType().cast().getElementType()))">; def IsLastDimEqualToNumElements : Constraint().getRank() >= 1 && " - "$0.getType().cast().getDimSize($0.getType().cast().getRank() - 1) == " - "$1.getType().cast().getNumElements()">>; + "TFL::IsLastDimEqualToNumElements($0.getType(), $1.getType())">>; def IsDefinedByFullyConnectedOp : Constraint() != nullptr">>; +// Returns true if the supplied value- +// 1) Has only one use or +// 2) Is only used by binary op like AddOp, SubOp, MulOp or DivOp. +def HasOneUseOrUsedByOnlyBinaryOps : Constraint>; + // Pattern for skipping Tile if it is mainly for broadcasting and the // Op is already supporting broadcasting. 
multiclass FuseTileBroadcastIntoFollowingBinary { @@ -475,43 +502,94 @@ foreach BinaryOp = [TFL_AddOp, TFL_SubOp, TFL_DivOp, TFL_MulOp] in { (HasRankAtMost<4> $rhs), (SameElementType $input, $rhs)]>; - // Move binary op before reshape: - // binary(reshape(lhs), reshape(rhs)) => reshape(binary(lhs, rhs)) - // This is valid only when both side of the binary operand is reshaped, and - // the sizes are the same both before and after the reshape. - def MoveBinaryOpBeforeReshape#BinaryOp : Pat< - (BinaryOp (TFL_ReshapeOp:$lhs $input1, (Arith_ConstantOp:$shape1 $s1)), - (TFL_ReshapeOp:$rhs $input2, (Arith_ConstantOp:$shape2 $s2)), - $act_fn), - (TFL_ReshapeOp (BinaryOp $input1, $input2, $act_fn), $shape1), - [(IsTailOfShape $rhs, $lhs), - (IsTailOfShape $lhs, $rhs), - (IsTailOfShape $input1, $input2), - (IsTailOfShape $input2, $input1), - (SameElementType $input1, $input2)]>; + // Move binary op before reshape: + // binary(reshape(lhs), reshape(rhs)) => reshape(binary(lhs, rhs)) + // This is valid only when both side of the binary operand is reshaped, and + // the sizes are the same both before and after the reshape. + def MoveBinaryOpBeforeReshape#BinaryOp : Pat< + (BinaryOp (TFL_ReshapeOp:$lhs $input1, (Arith_ConstantOp:$shape1 $s1)), + (TFL_ReshapeOp:$rhs $input2, (Arith_ConstantOp:$shape2 $s2)), + $act_fn), + (TFL_ReshapeOp (BinaryOp $input1, $input2, $act_fn), $shape1), + [(IsTailOfShape $rhs, $lhs), + (IsTailOfShape $lhs, $rhs), + (IsTailOfShape $input1, $input2), + (IsTailOfShape $input2, $input1), + (SameElementType $input1, $input2)]>; - // Move binary op before reshape: - // binary(reshape(lhs), rhs) => reshape(binary(lhs, flatten(rhs))) - // This is valid only when the last dimension of lhs is equal to the - // number of elements in constant rhs. - // Therefore, after transformation broadcast of binary op is always - // applied to the last dimension of $input. - def MoveBinaryOpFlattenConstBeforeReshape#BinaryOp : Pat< - (BinaryOp (TFL_ReshapeOp:$lhs $input, (Arith_ConstantOp:$shape $s)), - (Arith_ConstantOp:$rhs ElementsAttr:$rhs_attr), $act_fn), - (TFL_ReshapeOp (BinaryOp $input, (Arith_ConstantOp (Flatten $rhs_attr)), - $act_fn), - $shape), - [(AnyStaticShapeTensor $input), - (IsTailOfShape $rhs, $lhs), - (IsLastDimEqualToNumElements $input, $rhs), - (HasOneUse $lhs), - // Restrict operands to have at most rank 4 because TFLite binary - // kernel supports up to 4D broadcast. - (HasRankAtMost<4> $input), - (HasRankAtMost<4> $lhs), - (HasRankAtMost<4> $rhs), - (IsDefinedByFullyConnectedOp $input)]>; + // Move binary op batched RHS before reshape: + // binary(reshape(lhs), rhs) => reshape(binary(lhs, flatten(rhs))) + // Pattern targeted here is as follows- + // [input, lhs, rhs] == [<1x1024x128>, <1x1024x8x16>, <1x1x8x16xf32>] + // This is valid only when the- + // 1.last dimension of lhs is equal to the number of elements in constant rhs. + // 2.Reduced shape of rhs, here <8x16> is equal to last dimensions of lhs. + // Therefore, after transformation broadcast of binary op is always + // applied to the last dimension of $input.
+ def MoveBinaryOpFlattenConstBeforeReshape#BinaryOp : Pat< + (BinaryOp (TFL_ReshapeOp:$lhs $input, (Arith_ConstantOp:$shape $s)), + (Arith_ConstantOp:$rhs ElementsAttr:$rhs_attr), $act_fn), + (TFL_ReshapeOp (BinaryOp $input, (Arith_ConstantOp (Flatten $rhs_attr)), + $act_fn), + $shape), + [(AnyStaticShapeTensor $input), + (IsReducedTailOfShape $rhs, $lhs), + (IsLastDimEqualToNumElements $input, $rhs), + (HasOneUse $lhs), + // Restrict operands to have at most rank 4 because TFLite binary + // kernel supports up to 4D broadcast. + (HasRankAtMost<4> $input), + (HasRankAtMost<4> $lhs), + (HasRankAtMost<4> $rhs), + (IsDefinedByFullyConnectedOp $input)]>; + + // Pattern to remove redundant reshape op used as LHS to binary ops + // Binary(Reshape(input, shape), rhs) -> Binary(input, rhs) + // This pattern is valid only if- + // 1. The shape is only adding broadcasting that can otherwise be implicitly + // handled by the binary op. Ex- shape == [1, 1, 1, 128] + // 2. The rank of the input to reshape is <= reshape output. + // 3. The rank of the output to reshape is <= binary rhs. + // The conditions 2 and 3 will make sure any required increase in + // dimensionality due to reshape op is not lost. + def RemoveRedundantReshapeUsedAsLhsTo#BinaryOp : Pat< + (BinaryOp (TFL_ReshapeOp:$lhs $input, (Arith_ConstantOp:$shape $s)), + $rhs, $act_fn), + (BinaryOp $input, $rhs, $act_fn), + [(AnyStaticShapeTensor $input), + (AnyStaticShapeTensor $rhs), + (IsRankLessThanEqualTo $input, $lhs), + (IsRankLessThanEqualTo $lhs, $rhs), + (IsReducedTailOfShape $lhs, $input), + (HasOneUseOrUsedByOnlyBinaryOps $lhs), + // Restrict operands to have at most rank 4 because TFLite binary + // kernel supports up to 4D broadcast. + (HasRankAtMost<4> $input), + (HasRankAtMost<4> $rhs)]>; + + // Pattern to remove redundant reshape op used as RHS to binary ops + // Binary(lhs, Reshape(input, shape)) -> Binary(lhs, input) + // This pattern is valid only if- + // 1. The shape is only adding broadcasting that can otherwise be implicitly + // handled by the binary op. Ex- shape == [1, 1, 1, 128] + // 2. The rank of the input to reshape is <= reshape output. + // 3. The rank of the output to reshape is <= binary lhs. + // The conditions 2 and 3 will make sure any required increase in + // dimensionality due to reshape op is not lost. + def RemoveRedundantReshapeUsedAsRhsTo#BinaryOp : Pat< + (BinaryOp $lhs, (TFL_ReshapeOp:$rhs $input, (Arith_ConstantOp:$shape $s)), + $act_fn), + (BinaryOp $lhs, $input, $act_fn), + [(AnyStaticShapeTensor $input), + (AnyStaticShapeTensor $lhs), + (IsRankLessThanEqualTo $input, $rhs), + (IsRankLessThanEqualTo $rhs, $lhs), + (IsReducedTailOfShape $rhs, $input), + (HasOneUseOrUsedByOnlyBinaryOps $rhs), + // Restrict operands to have at most rank 4 because TFLite binary + // kernel supports up to 4D broadcast. + (HasRankAtMost<4> $input), + (HasRankAtMost<4> $lhs)]>; } foreach BinaryOp = [TFL_FloorDivOp, TFL_FloorModOp, TFL_MinimumOp, @@ -620,12 +698,22 @@ def ConvertExpandDimsToReshape : Pat< class FloatValueEquals : Constraint>; +// Here, the element type can be any integer or float type.
+class IsConstantValueOf : Constraint>; + // ReLU patterns def MatchReluPattern : Pat< (TFL_MaximumOp $input, (Arith_ConstantOp $Zero)), (TFL_ReluOp $input), [(FloatValueEquals<"0"> $Zero)]>; +// Optimize Minimum of tf.Relu and constant six to tf.Relu6 +def MinimumOfReluAnd6ToRelu6 : + Pat<(TFL_MinimumOp (TFL_ReluOp $x), (Arith_ConstantOp $y)), + (TFL_Relu6Op $x), + [(IsConstantValueOf<6> $y)]>; + def MatchRelu1Pattern1 : Pat< (TFL_MinimumOp (TFL_MaximumOp $input, (Arith_ConstantOp $NegOne)), (Arith_ConstantOp $One)), @@ -855,6 +943,23 @@ foreach SelectOp = [TFL_SelectOp, TFL_SelectV2Op] in { def Optimize#SelectOp#Not : Pat< (SelectOp (TFL_LogicalNotOp $condition), $input1, $input2), (SelectOp $condition, $input2, $input1)>; + + // Fuse select(broadcast_to(input, shape), x, y) -> selectV2(input, x, y) + // Also, fuse selectv2(broadcast_to(input, shape), x, y) -> selectV2(input, x, y) + // It is safe to perform this transform here because- + // the shapes of `pre_broadcast` and `dim` must be broadcast + // compatible for the `broadcast_to` op to be valid. + // And considering, `shape(post_broadcast)` == `shape(%input1)`, + // `post_broadcast` is broadcast compatible with `input1`. + def FuseBroadcastInto#SelectOp : Pat< + (SelectOp + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), + AnyStaticShapeTensor:$input1, AnyStaticShapeTensor:$input2), + (TFL_SelectV2Op $pre_broadcast, $input1, $input2), + [(HasSameStaticShapes $post_broadcast, $input1), + (HasRankAtMost<4> $post_broadcast), + (HasRankAtMost<4> $input1), + (HasRankAtMost<4> $input2)]>; } def EliminateLogicalAndTrue : Pat< @@ -914,38 +1019,33 @@ def OptimizeSliceOp : Pat< (replaceWithValue $input), [(CanOptimizeIdentitySliceOp $input, $begin, $size)]>; -def GetNumElementsOrOne: NativeCodeCall<"GetNumElementsOrOne($0)">; +def GetNumElementsOrOne: NativeCodeCall<"GetNumElementsOrOne($0.getType())">; def ReshapeValueDroppingLastDim : NativeCodeCall< - "ReshapeValueDroppingLastDim($_builder, $0, $1)">; - -def HasExactlyTwoElements : Constraint>; - -def IsLastElementEqualsOne : Constraint>; + "ReshapeValueDroppingLastDim($_builder, $0)">; def IsOneHotIndexAttribute : Constraint>; +// Checks if the shape has shape with last dimension equals 1. +def IsLastDimensionEqualOne : Constraint>; + // Replace -// Equal(Reshape(X, shape), indices) +// Equal(X, indices) // With -// OneHot(Reshape(X, shape[:-1]), N, true, false, -1) +// OneHot(Reshape(X), N, true, false, -1) // where -// - shape has length 2 (unnecessary, just to be conservative) -// - last value in shape is 1 +// - last dimension of the LHS of the equal is 1, and the rank is at least 2. // - indices is a incrementing series from 0 to N-1. (N elements total.) 
def ReshapeEqualOpToOneHotOp : Pat< - (TFL_EqualOp (TFL_ReshapeOp $x, (Arith_ConstantOp $shape)), - (Arith_ConstantOp $series)), - (TFL_OneHotOp (ReshapeValueDroppingLastDim $x, $shape), + (TFL_EqualOp $x, (Arith_ConstantOp $series)), + (TFL_OneHotOp (ReshapeValueDroppingLastDim $x), (Arith_ConstantOp (GetNumElementsOrOne $series)), (Arith_ConstantOp ConstantAttr, "true">), (Arith_ConstantOp ConstantAttr, "false">), ConstantAttr), - [(HasExactlyTwoElements $shape), - (IsLastElementEqualsOne $shape), + [(IsLastDimensionEqualOne $x), + (HasRankAtLeast<2> $x), (IsOneHotIndexAttribute $series)]>; def F32ElementsVal : Constraint; +def Get1DShapeValue: NativeCodeCall<"Get1DShapeValue($_builder, $0)">; + +class GetIthValue : NativeCodeCall<"$0[" # index # "]">; + +def GetEmbeddingLookupShape: NativeCodeCall<"GetEmbeddingLookupShape($0, $1)">; + // Replace // OneHot(index, depth, on=1.0f, off=0.0f, axis=-1) * filter // With @@ -996,26 +1102,29 @@ def FuseOneHotAndCastToFloat : Pat< // This is exactly what the EmbeddedLookup operator is doing, on the transposed // matrix, without doing any arithmetic but only memcpy. def ReplaceOneHotFullyConnectedWithLookup : Pat< - (TFL_FullyConnectedOp + (TFL_FullyConnectedOp:$outputs (TFL_OneHotOp - $indices, + AnyStaticShapeTensor:$indices, (Arith_ConstantOp $depth), (Arith_ConstantOp ConstantAttr, "1.0f">), (Arith_ConstantOp ConstantAttr, "0.0f">), ConstantAttr), - $filter, + StaticShapeTensorOf<[F32, I8, UI8]>:$filter, $bias, TFL_AF_None, TFL_FCWO_Default, - ConstBoolAttrFalse, + $keep_num_dims, $asymmetric_quantize_inputs), + (TFL_ReshapeOp (TFL_EmbeddingLookupOp - $indices, + (TFL_ReshapeOp $indices, (Get1DShapeValue $indices)), (TFL_TransposeOp $filter, - (Arith_ConstantOp ConstantAttr, "{1,0}"> ))), + (Arith_ConstantOp ConstantAttr, "{1,0}">)), + (returnType (GetEmbeddingLookupShape $indices, $filter)) + ), + (Arith_ConstantOp (GetShape (GetIthValue<0> $outputs)))), [(I32ElementsVal $indices), // lookup is not implemented for i64 - (HasRank<1> $indices), // lookup isn't implemented for any other rank (IsNoneType $bias)]>; // Maybe folded into the lookup matrix later def AreInputDimensionsOneInAxes : Constraint; - -// Checks if the shape has shape with last dimension equals 1. -def IsLastDimensionEqualOne : Constraint>; - // Fetches the output of FC op, from the provided arguments. 
def GetFcOutput : NativeCodeCall< "GetFcOutput(&$_builder, $0, $1, $2, $3, $4, $5, $6, $7)">; diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc index 21313752165..efd3506a8aa 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc @@ -240,10 +240,10 @@ struct FoldTransposeOp : public OpRewritePattern { ElementsAttr input_tensor = qconst_op.getValue(); assert(perm_tensor.getType().getRank() == 1); - const int num_dimensions = input_tensor.getType().getRank(); + const int num_dimensions = input_tensor.getShapedType().getRank(); assert(perm_tensor.getType().getNumElements() == num_dimensions); - ArrayRef input_shape = input_tensor.getType().getShape(); + ArrayRef input_shape = input_tensor.getShapedType().getShape(); auto output_type = op.getOutput().getType().cast(); SmallVector perm; @@ -258,7 +258,7 @@ struct FoldTransposeOp : public OpRewritePattern { } std::vector new_values; - new_values.reserve(input_tensor.getType().getNumElements()); + new_values.reserve(input_tensor.getShapedType().getNumElements()); std::vector input_indices(num_dimensions); ComputePermutation(input_tensor, perm, output_shape, num_dimensions, /*output_axis=*/0, &input_indices, &new_values); diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index 7803940f65d..9064d6c7f50 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -76,6 +76,7 @@ def ConvertPlaceholderWithDefault : Pat<(TF_PlaceholderWithDefaultOp $arg), (TF_ //===----------------------------------------------------------------------===// // Op removal patterns. //===----------------------------------------------------------------------===// +def RemoveXlaSharding : Pat<(TF_XlaShardingOp $a, $b, $c), (replaceWithValue $a)>; def RemoveIdentityN : Pat<(TF_IdentityNOp $arg), (replaceWithValue $arg)>; //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc index 04f7fb84011..a19c29a666f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc @@ -108,7 +108,7 @@ class PrepareDynamicRangeQuantizableOp return failure(); } - // 2. Quantize collected ops. It is immediatly quantized by inserting Q-DQ + // 2. Quantize collected ops. It is immediately quantized by inserting Q-DQ // pair for int8 while it is lazily applied for float16 by inserting CastOp. if (!(quantizeOps(rewriter, op, quantizable_ops))) { return failure(); @@ -160,7 +160,7 @@ class PrepareDynamicRangeQuantizableOp // Insert CastOp which is used to for converting float32 ConstantOp into // float16 quantization. If there is an existing CastOp connected to the // ConstantOp, the quantize_op will be rewired to the existing CastOp. This - // guarentees at most one CastOp is created for float32 to float16 conversion. + // guarantees at most one CastOp is created for float32 to float16 conversion. 
void quantizeOpAsFloat16(PatternRewriter& rewriter, arith::ConstantOp op, std::pair quant_op) const { Operation* quantize_op = quant_op.first; diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td index ae2d501643c..4a0c7c42e90 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td @@ -24,7 +24,7 @@ include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" // Quantize attribute $0 by using quantization parameter from %1. def QuantizeByQuantizedType : NativeCodeCall<"quant::Quantize($0, $1.getValue())">; def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; + CPred<"$_self.cast().getShapedType().getElementType().isF32()">, "float constant tensor">; // Squash tfl.dequantize and tfl.quantize pairs. // TODO(fengliuai): Compare the scale of input and output. This can also be diff --git a/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc index 2a85b5d54aa..ce2d51a66e5 100644 --- a/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc @@ -90,7 +90,7 @@ void RaiseCustomOpsPass::runOnOperation() { new_block->addArguments(op->getOperandTypes(), SmallVector(op->getNumOperands(), loc)); - for (auto &idx_args : llvm::enumerate(new_block->getArguments())) { + for (const auto &idx_args : llvm::enumerate(new_block->getArguments())) { inner_op->setOperand(idx_args.index(), idx_args.value()); } custom_op->setAttrs(inner_op->getAttrs()); diff --git a/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc b/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc index 60a81091be8..20336080cc2 100644 --- a/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc @@ -20,8 +20,8 @@ namespace mlir { namespace TFL { FloatAttr ExtractSingleElementAsFloat(ElementsAttr attr) { - if (attr.getType().getNumElements() != 1 || - !attr.getType().getElementType().isa()) { + if (attr.getShapedType().getNumElements() != 1 || + !attr.getShapedType().getElementType().isa()) { return {}; } return attr.getSplatValue(); @@ -36,8 +36,8 @@ FloatAttr GetSingleElementAsFloatOrSelf(Attribute attr) { } IntegerAttr ExtractSingleElementAsInteger(ElementsAttr attr) { - if (attr.getType().getNumElements() != 1 || - !attr.getType().getElementType().isSignlessInteger()) { + if (attr.getShapedType().getNumElements() != 1 || + !attr.getShapedType().getElementType().isSignlessInteger()) { return {}; } return attr.getSplatValue(); diff --git a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc index b49f2a10bc5..9f2301d4803 100644 --- a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc @@ -33,7 +33,7 @@ tsl::StatusOr CreateConstOpWithSingleValue( int value) { Type element_type = shaped_type.getElementType(); ShapedType scalar_type = RankedTensorType::get({}, element_type); - Attribute attr; + TypedAttr attr; if (element_type.isF16()) { auto floatType = mlir::FloatType::getF16(element_type.getContext()); auto floatAttr = mlir::FloatAttr::get(floatType, static_cast(value)); @@ -118,7 +118,8 @@ tsl::StatusOr CreateConstOpWithSingleValue( return 
tensorflow::Status(absl::StatusCode::kInvalidArgument, "Unsupported type"); } - return rewriter->create(loc, scalar_type, attr); + return rewriter->create(loc, scalar_type, + cast(attr)); } } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/utils/low_bit_utils.cc b/tensorflow/compiler/mlir/lite/utils/low_bit_utils.cc index 3311a75e387..aa2e9697595 100644 --- a/tensorflow/compiler/mlir/lite/utils/low_bit_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/low_bit_utils.cc @@ -22,14 +22,18 @@ limitations under the License. namespace tflite { std::vector PackInt4ValuesDensely(std::vector src_buffer) { - std::vector packed_buffer((src_buffer.size() + 1) / 2); + auto num_elements = src_buffer.size(); + auto packed_size = (num_elements + 1) / 2; + std::vector packed_buffer((num_elements + 1) / 2); - for (int i = 0; i < src_buffer.size(); ++i) { - if (i % 2 == 0) { - packed_buffer.at(i / 2) = src_buffer[i]; - } else { - packed_buffer.at(i / 2) |= src_buffer[i] << 4; - } + for (int i = 0; i < num_elements - 1; i += 2) { + packed_buffer[i / 2] = src_buffer[i] & 0x0F; + packed_buffer[i / 2] |= src_buffer[i + 1] << 4; + } + + // Copy the final nibble if the buffer is odd-lengthed + if (num_elements % 2 != 0) { + packed_buffer[packed_size - 1] = src_buffer[num_elements - 1] & 0x0F; } return packed_buffer; diff --git a/tensorflow/compiler/mlir/lite/utils/utils.h b/tensorflow/compiler/mlir/lite/utils/utils.h new file mode 100644 index 00000000000..7878b675895 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/utils.h @@ -0,0 +1,55 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_UTILS_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +using llvm::ArrayRef; +using mlir::Operation; +using mlir::ShapedType; +using mlir::Value; + +// Returns true if all tensor value in `values` has static shape and same shape. 
+inline bool OpHasSameStaticShapes(Operation* op) { + auto values = op->getOperands(); + int operand_num = 0; + ArrayRef shape; + for (Value value : values) { + auto shaped_type = value.getType().dyn_cast(); + if (!shaped_type || !shaped_type.hasStaticShape()) { + return false; + } + if (operand_num == 0) { + shape = shaped_type.getShape(); + } else { + if (shape != shaped_type.getShape()) { + return false; + } + } + ++operand_num; + } + return true; +} +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/utils/utils.td b/tensorflow/compiler/mlir/lite/utils/utils.td index bd832527ce5..4c8485c3551 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.td +++ b/tensorflow/compiler/mlir/lite/utils/utils.td @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -27,5 +27,11 @@ def NotFromQuantOpOrSameQuantType : Constraint< def SameElementType : Constraint< CPred<"getElementTypeOrSelf($0) == getElementTypeOrSelf($1)">>; -// Checks if the value has only one user. -def HasOneUse : Constraint>; +// Checks if all of an ops inputs are the same static shape. +// BUILD NOTE: "OpHasSameStaticShapes" here refers to the C++ function defined +// in `utils/utils.h`. The `utils.h` header is included in `tfl_ops.h` so all +// of our files will have access to `OpHasSameStaticShapes` when including files +// generated from table-gen. +def OpHasSameStaticShapesPred : CPred<"OpHasSameStaticShapes($0.getDefiningOp())">; +def OpHasSameStaticShapes : Constraint; +def OpHasNotSameStaticShapes : Constraint, "op must have not static same input shapes">; \ No newline at end of file diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc index 90b9d8cc854..dc69f3d64bb 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -57,6 +57,14 @@ auto* mlir_graph_optimization_pass_fallback_count = monitoring::Counter<1>::New( "used", /* metric field */ "status"); +auto* mlir_function_pass_graph_conversion_count = monitoring::Counter<1>::New( + /* metric name */ + "/tensorflow/core/mlir_function_pass_graph_conversion_count", + /* metric description */ + "Track success/failure of Graph to MLIR conversions in function " + "optimization pass", + /* metric field */ "status"); + // The status metric field is used to record success/failure of mlir // function/graph optimization passes. 
constexpr char kSuccess[] = "kSuccess"; @@ -76,8 +84,8 @@ static void DumpModule(mlir::ModuleOp module, std::string file_prefix) { auto* env = tensorflow::Env::Default(); auto status = env->RecursivelyCreateDir(prefix); if (!status.ok()) { - LOG(WARNING) << "cannot create directory '" + prefix + - "': " + status.error_message(); + LOG(WARNING) << "cannot create directory '" << prefix + << "': " << status.message(); return; } @@ -90,8 +98,7 @@ static void DumpModule(mlir::ModuleOp module, std::string file_prefix) { std::unique_ptr file_writer; status = env->NewWritableFile(prefix, &file_writer); if (!status.ok()) { - LOG(WARNING) << "cannot open file '" + prefix + - "': " + status.error_message(); + LOG(WARNING) << "cannot open file '" << prefix << "': " << status.message(); return; } @@ -104,21 +111,14 @@ static void DumpModule(mlir::ModuleOp module, std::string file_prefix) { status = file_writer->Append(txt_module); if (!status.ok()) { - LOG(WARNING) << "error writing to file '" + prefix + - "': " + status.error_message(); + LOG(WARNING) << "error writing to file '" << prefix + << "': " << status.message(); return; } (void)file_writer->Close(); VLOG(1) << "Dumped MLIR module to " << prefix; } -static std::string GetModuleText(mlir::ModuleOp module) { - std::string module_txt; - llvm::raw_string_ostream os(module_txt); - module.print(os); - return module_txt; -} - MlirOptimizationPassRegistry& MlirOptimizationPassRegistry::Global() { static auto* global = new MlirOptimizationPassRegistry(); return *global; @@ -137,8 +137,8 @@ static void RegisterDialects(mlir::DialectRegistry& registry) { Status MlirFunctionOptimizationPass::Run( const std::string& function_name, const DeviceSet& device_set, - const ConfigProto& config_proto, std::unique_ptr* graph, - FunctionLibraryDefinition* flib_def, + const ConfigProto& config_proto, absl::string_view xla_compile_device_type, + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, std::vector* control_ret_node_names, bool* control_rets_updated) { // overall_state equals to: @@ -208,6 +208,7 @@ Status MlirFunctionOptimizationPass::Run( // the shape inference pass is run early in the pass pipeline, shape inference // during import is not necessary. 
import_config.enable_shape_inference = false; + import_config.xla_compile_device_type = xla_compile_device_type; static const char* kTfMlirCategory = "TfMlir"; tensorflow::metrics::ScopedCounter<2> timings( @@ -216,6 +217,9 @@ Status MlirFunctionOptimizationPass::Run( auto module_ref_status = ConvertGraphToMlir(**graph, debug_info, *flib_def, import_config, &context); + mlir_function_pass_graph_conversion_count + ->GetCell(absl::StatusCodeToString(module_ref_status.status().code())) + ->IncrementBy(1); timings.ReportAndStop(); if (!module_ref_status.ok()) { @@ -237,8 +241,14 @@ Status MlirFunctionOptimizationPass::Run( for (auto& pass_registration : registry_->passes()) { llvm::StringRef name = pass_registration.pass->name(); - DUMP_MLIR_MODULE(function_name, llvm::formatv("mlir_{0}_before", name), - GetModuleText(*module_ref), VLOG_IS_ON(1)); + if (DEBUG_DATA_DUMPER()->ShouldDump(function_name, kDebugGroupMain) || + VLOG_IS_ON(1)) { + ::tensorflow::DumpMlirOpToFile( + DEBUG_DATA_DUMPER()->GetDumpFilename( + function_name, kDebugGroupMain, + llvm::formatv("mlir_{0}_before", name)), + *module_ref, llvm::StringRef(), nullptr); + } Status pass_status = OkStatus(); auto pass_state = per_pass_state[per_pass_state_index++]; @@ -247,8 +257,8 @@ Status MlirFunctionOptimizationPass::Run( VLOG(2) << "Graph #nodes " << (*graph)->num_nodes() << " #edges " << (*graph)->num_edges(); timings.Reset({kTfMlirCategory, name.str()}); - pass_status = pass_registration.pass->Run(config_proto, *module_ref, - **graph, *flib_def); + pass_status = pass_registration.pass->Run( + function_name, config_proto, *module_ref, **graph, *flib_def); timings.ReportAndStop(); if (pass_status.ok()) { VLOG(2) << "Finished MLIR graph optimization pass: " @@ -266,8 +276,8 @@ Status MlirFunctionOptimizationPass::Run( // module in case of no failures. 
auto module_ref_clone = module_ref->clone(); timings.Reset({kTfMlirCategory, name.str() + "_fallback"}); - pass_status = pass_registration.pass->Run(config_proto, module_ref_clone, - **graph, *flib_def); + pass_status = pass_registration.pass->Run( + function_name, config_proto, module_ref_clone, **graph, *flib_def); timings.ReportAndStop(); if (pass_status.ok()) { @@ -304,8 +314,13 @@ Status MlirFunctionOptimizationPass::Run( } } - DUMP_MLIR_MODULE(function_name, llvm::formatv("mlir_{0}_after", name), - GetModuleText(*module_ref), VLOG_IS_ON(1)); + if (DEBUG_DATA_DUMPER()->ShouldDump(function_name, kDebugGroupMain) || + VLOG_IS_ON(1)) { + ::tensorflow::DumpMlirOpToFile(DEBUG_DATA_DUMPER()->GetDumpFilename( + function_name, kDebugGroupMain, + llvm::formatv("mlir_{0}_after", name)), + *module_ref, llvm::StringRef(), nullptr); + } } if (!is_module_updated) { diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h index 8fe0ccbd00e..d3a8420af94 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h @@ -65,7 +65,8 @@ class MlirOptimizationPass { const Graph& graph, const FunctionLibraryDefinition& function_library) const = 0; - virtual Status Run(const ConfigProto& config_proto, mlir::ModuleOp module, + virtual Status Run(const std::string& function_name, + const ConfigProto& config_proto, mlir::ModuleOp module, const Graph& graph, const FunctionLibraryDefinition& function_library) = 0; }; @@ -118,8 +119,9 @@ class MlirFunctionOptimizationPass : public FunctionOptimizationPass { // Executes all of the underlying registered MlirOptimizationPasses. Status Run(const std::string& function_name, const DeviceSet& device_set, - const ConfigProto& config_proto, std::unique_ptr* graph, - FunctionLibraryDefinition* flib_def, + const ConfigProto& config_proto, + absl::string_view xla_compile_device_type, + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, std::vector* control_ret_node_names, bool* control_rets_updated) override; diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc index 36ba9160f59..4e7d1449946 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc @@ -15,11 +15,15 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" +#include #include +#include +#include #include #include "mlir/IR/Builders.h" // from @llvm-project #include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/monitoring/cell_reader.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -29,6 +33,11 @@ using ::testing::NiceMock; using ::testing::Return; using ::testing::Test; +constexpr char kOk[] = "OK"; +constexpr char kInvalidArgument[] = "INVALID_ARGUMENT"; +constexpr char kSuccess[] = "kSuccess"; +constexpr char kFailure[] = "kFailure"; + class MockMlirOptimizationPass : public MlirOptimizationPass { public: // MOCK_METHOD does not work on Windows build, using MOCK_CONST_METHODX @@ -39,7 +48,8 @@ class MockMlirOptimizationPass : public MlirOptimizationPass { const DeviceSet* device_set, const ConfigProto& config_proto, const Graph& graph, const FunctionLibraryDefinition& function_library)); - MOCK_METHOD4(Run, Status(const ConfigProto& config_proto, + MOCK_METHOD5(Run, Status(const std::string& function_name, + const ConfigProto& config_proto, mlir::ModuleOp module, const Graph& graph, const FunctionLibraryDefinition& function_library)); }; @@ -72,8 +82,8 @@ class ModifyMlirModulePass : public MlirOptimizationPass { // Just modify MLIR module so that we can check whether original TF graph // has changed or not. - Status Run(const ConfigProto& config_proto, mlir::ModuleOp module, - const Graph& graph, + Status Run(const std::string& function_name, const ConfigProto& config_proto, + mlir::ModuleOp module, const Graph& graph, const FunctionLibraryDefinition& function_library) override { mlir::Builder b(module.getContext()); auto producer = b.getNamedAttr("producer", b.getI32IntegerAttr(0)); @@ -123,10 +133,11 @@ class MlirGraphOptimizationPassTest : public Test { ON_CALL(*optimization_pass, GetPassState(_, _, _, _)) .WillByDefault(Return(pass_state)); - ON_CALL(*optimization_pass, Run(_, _, _, _)) + ON_CALL(*optimization_pass, Run(_, _, _, _, _)) .WillByDefault(Return(pass_run_result)); MlirOptimizationPassRegistry::Global().Add(pass_priority++, std::move(optimization_pass)); + pass_result_expected_[pass_state][pass_run_result.ok()]++; } flib_ = std::make_unique(graph_->flib_def()); @@ -141,6 +152,7 @@ class MlirGraphOptimizationPassTest : public Test { .WillByDefault(Return(pass_state)); MlirOptimizationPassRegistry::Global().Add(10, std::move(optimization_pass)); + pass_result_expected_[pass_state][run_status.ok()]++; } void TearDown() override { @@ -164,31 +176,60 @@ class MlirGraphOptimizationPassTest : public Test { #endif } + void verifyCounters() { + EXPECT_EQ(mlir_function_pass_fallback_count_.Read(kSuccess), + pass_result_expected_[MlirOptimizationPassState::FallbackEnabled] + [true]); + EXPECT_EQ(mlir_function_pass_fallback_count_.Read(kFailure), + pass_result_expected_[MlirOptimizationPassState::FallbackEnabled] + [false]); + EXPECT_EQ(mlir_function_pass_graph_conversion_count_.Read(kOk), 1); + } + ConfigProto config_proto_; MlirFunctionOptimizationPass function_optimization_pass_; DeviceSet device_set_; std::unique_ptr graph_; std::unique_ptr flib_; std::vector control_ret_node_names_; + std::string xla_compile_device_type_; bool control_rets_updated_{false}; + monitoring::testing::CellReader mlir_function_pass_fallback_count_ = + monitoring::testing::CellReader( + /* metric name */ + "/tensorflow/core/mlir_function_pass_fallback_count"); + monitoring::testing::CellReader + 
mlir_graph_optimization_pass_fallback_count_ = + monitoring::testing::CellReader( + /* metric name */ + "/tensorflow/core/mlir_graph_optimization_pass_fallback_count"); + monitoring::testing::CellReader + mlir_function_pass_graph_conversion_count_ = + monitoring::testing::CellReader( + /* metric name */ + "/tensorflow/core/mlir_function_pass_graph_conversion_count"); + std::map> + pass_result_expected_; }; TEST_F(MlirGraphOptimizationPassTest, OptimizationPassFailsNoFallback) { - Init(Status(error::Code::ABORTED, "aborted"), + Init(Status(absl::StatusCode::kAborted, "aborted"), {MlirOptimizationPassState::Enabled}); GraphDef original_graph_def; graph_->ToGraphDef(&original_graph_def); EXPECT_EQ(function_optimization_pass_.Run( - "test_func", device_set_, config_proto_, &graph_, flib_.get(), + "test_func", device_set_, config_proto_, + xla_compile_device_type_, &graph_, flib_.get(), &control_ret_node_names_, &control_rets_updated_), - Status(error::Code::ABORTED, "aborted")); + Status(absl::StatusCode::kAborted, "aborted")); verifyGraph(original_graph_def); + verifyCounters(); } TEST_F(MlirGraphOptimizationPassTest, OptimizationPassFailsDisabledFallback) { - Init(Status(error::Code::ABORTED, "aborted"), + Init(Status(absl::StatusCode::kAborted, "aborted"), {MlirOptimizationPassState::Disabled, MlirOptimizationPassState::FallbackEnabled}); @@ -203,13 +244,15 @@ TEST_F(MlirGraphOptimizationPassTest, OptimizationPassFailsDisabledFallback) { GraphDef original_graph_def; graph_->ToGraphDef(&original_graph_def); AddModuleModificationPass(MlirOptimizationPassState::FallbackEnabled, - Status(error::Code::ABORTED, "aborted")); + Status(absl::StatusCode::kAborted, "aborted")); EXPECT_EQ(function_optimization_pass_.Run( - "test_func", device_set_, config_proto_, &graph_, flib_.get(), + "test_func", device_set_, config_proto_, + xla_compile_device_type_, &graph_, flib_.get(), &control_ret_node_names_, &control_rets_updated_), OkStatus()); verifyGraph(original_graph_def); + verifyCounters(); } TEST_F(MlirGraphOptimizationPassTest, OptimizationPassDoesNotFailFallback) { @@ -221,11 +264,32 @@ TEST_F(MlirGraphOptimizationPassTest, OptimizationPassDoesNotFailFallback) { AddModuleModificationPass(MlirOptimizationPassState::FallbackEnabled, OkStatus()); EXPECT_EQ(function_optimization_pass_.Run( - "test_func", device_set_, config_proto_, &graph_, flib_.get(), + "test_func", device_set_, config_proto_, + xla_compile_device_type_, &graph_, flib_.get(), &control_ret_node_names_, &control_rets_updated_), OkStatus()); verifyGraph(original_graph_def, true); + verifyCounters(); +} + +TEST_F(MlirGraphOptimizationPassTest, GraphDoesntConvertUpdatesCounter) { + Init(OkStatus(), {MlirOptimizationPassState::FallbackEnabled}); + + graph_ = std::make_unique(OpRegistry::Global()); + control_ret_node_names_.push_back("foo"); + + AddModuleModificationPass(MlirOptimizationPassState::FallbackEnabled, + OkStatus()); + EXPECT_EQ(function_optimization_pass_.Run( + "test_func", device_set_, config_proto_, + xla_compile_device_type_, &graph_, flib_.get(), + &control_ret_node_names_, &control_rets_updated_), + OkStatus()); + + EXPECT_EQ(mlir_function_pass_graph_conversion_count_.Read(kOk), 0); + EXPECT_EQ(mlir_function_pass_graph_conversion_count_.Read(kInvalidArgument), + 1); } TEST(MlirOptimizationPassRegistry, RegisterPassesWithTheSamePriorityFails) { diff --git a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc index 0f15052fa32..cbd03639c02 100644 --- 
a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc +++ b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc @@ -42,7 +42,8 @@ namespace tensorflow { OpOrArgNameMapper::~OpOrArgNameMapper() {} -llvm::StringRef OpOrArgNameMapper::GetUniqueName(llvm::StringRef prefix) { +llvm::StringRef OpOrArgNameMapper::GetUniqueName(llvm::StringRef prefix, + int hash_value) { // Insert/find if prefix is unique. auto prefix_it = name_to_count_.try_emplace(prefix, 0); if (prefix_it.second && IsUnique(prefix)) { @@ -55,8 +56,11 @@ llvm::StringRef OpOrArgNameMapper::GetUniqueName(llvm::StringRef prefix) { // Add increasing number (count) to end of prefix until it is determined // to be unique. auto& val = prefix_it.first->second; - llvm::SmallString<64> probe_name(prefix); - probe_name.append(GetSuffixSeparator()); + auto prefix_name = hash_value == 0 ? prefix.str() + GetSuffixSeparator().str() + : prefix.str() + GetDashSeparator().str() + + std::to_string(hash_value) + + GetDashSeparator().str(); + llvm::SmallString<64> probe_name(prefix_name); const int probe_prefix_size = probe_name.size(); while (true) { probe_name.resize(probe_prefix_size); @@ -75,11 +79,12 @@ llvm::StringRef OpOrArgNameMapper::GetUniqueName(llvm::StringRef prefix) { } } -llvm::StringRef OpOrArgNameMapper::GetUniqueName(OpOrVal op_or_val) { +llvm::StringRef OpOrArgNameMapper::GetUniqueName(OpOrVal op_or_val, + int hash_value) { auto& name = op_or_val_to_name_[op_or_val]; if (!name.empty()) return StringViewToRef(name); // Update the value in the map with unique name. - llvm::StringRef ref = GetUniqueName(GetName(op_or_val)); + llvm::StringRef ref = GetUniqueName(GetName(op_or_val), hash_value); name = StringRefToView(ref); return ref; } diff --git a/tensorflow/compiler/mlir/op_or_arg_name_mapper.h b/tensorflow/compiler/mlir/op_or_arg_name_mapper.h index f4aa8626f43..d8ff9cebc01 100644 --- a/tensorflow/compiler/mlir/op_or_arg_name_mapper.h +++ b/tensorflow/compiler/mlir/op_or_arg_name_mapper.h @@ -36,10 +36,10 @@ using OpOrVal = llvm::PointerUnion; class OpOrArgNameMapper { public: // Returns unique name for the given prefix. - llvm::StringRef GetUniqueName(llvm::StringRef prefix); + llvm::StringRef GetUniqueName(llvm::StringRef prefix, int hash_value = 0); // Returns unique name for the operation or value. - llvm::StringRef GetUniqueName(OpOrVal op_or_val); + llvm::StringRef GetUniqueName(OpOrVal op_or_val, int hash_value = 0); // Returns unique name as a string_view for the operation or value. absl::string_view GetUniqueNameView(OpOrVal op_or_val); @@ -67,6 +67,8 @@ class OpOrArgNameMapper { // Returns the separator used before uniqueing suffix. virtual llvm::StringRef GetSuffixSeparator() { return ""; } + virtual llvm::StringRef GetDashSeparator() { return "_"; } + private: // Returns name from the location of the operation or value. 
virtual std::string GetName(OpOrVal op_or_val) = 0; diff --git a/tensorflow/compiler/mlir/python/BUILD b/tensorflow/compiler/mlir/python/BUILD index 1596b976e6d..0afe50ac2e7 100644 --- a/tensorflow/compiler/mlir/python/BUILD +++ b/tensorflow/compiler/mlir/python/BUILD @@ -20,16 +20,20 @@ cc_library( "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:BytecodeWriter", "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:TosaDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", + "@stablehlo//:register", "//tensorflow/c:tf_status", "//tensorflow/c:tf_status_helper", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:tfe_context_internal", "//tensorflow/compiler/xla/mlir/framework/transforms:passes", "//tensorflow/compiler/xla/mlir_hlo:all_passes", + "//tensorflow/compiler/mlir/lite:flatbuffer_import", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:import_model", @@ -40,7 +44,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", "//tensorflow/compiler/mlir/tosa:passes_header", "//tensorflow/compiler/mlir/tosa:tf_passes", diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index 39593e2ded0..7cc1d25355e 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -29,6 +29,8 @@ limitations under the License. #include "llvm/Support/raw_ostream.h" #include "mlir/Bytecode/BytecodeWriter.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project #include "mlir/IR/AsmState.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/InitAllPasses.h" // from @llvm-project @@ -36,10 +38,12 @@ limitations under the License. 
#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "stablehlo/dialect/Register.h" // from @stablehlo #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/tfe_context_internal.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -119,7 +123,7 @@ std::string RunPassPipelineOnModule(mlir::ModuleOp module, mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext()); if (failed(pm.run(module))) { - Set_TF_Status_from_Status(status, statusHandler.ConsumeStatus()); + tsl::Set_TF_Status_from_Status(status, statusHandler.ConsumeStatus()); return "// error"; } } @@ -137,13 +141,13 @@ static std::string ImportGraphDefImpl(const std::string& proto, GraphDef graphdef; auto s = tensorflow::LoadProtoFromBuffer(proto, &graphdef); if (!s.ok()) { - Set_TF_Status_from_Status(status, s); + tsl::Set_TF_Status_from_Status(status, s); return "// error"; } mlir::MLIRContext context; auto module = ConvertGraphdefToMlir(graphdef, debug_info, specs, &context); if (!module.ok()) { - Set_TF_Status_from_Status(status, module.status()); + tsl::Set_TF_Status_from_Status(status, module.status()); return "// error"; } @@ -158,7 +162,7 @@ std::string ImportFunction(const std::string& functiondef_proto, FunctionDef functiondef; auto s = tensorflow::LoadProtoFromBuffer(functiondef_proto, &functiondef); if (!s.ok()) { - Set_TF_Status_from_Status(status, s); + tsl::Set_TF_Status_from_Status(status, s); return "// error"; } @@ -168,7 +172,7 @@ std::string ImportFunction(const std::string& functiondef_proto, const tensorflow::FunctionDef* fdef = flib_def.Find(function_name); if (fdef == nullptr) { s = tensorflow::errors::NotFound("Cannot find function ", function_name); - Set_TF_Status_from_Status(status, s); + tsl::Set_TF_Status_from_Status(status, s); return "// error"; } @@ -176,14 +180,14 @@ std::string ImportFunction(const std::string& functiondef_proto, s = FunctionDefToBodyHelper(*fdef, tensorflow::AttrSlice(), &flib_def, &fbody); if (!s.ok()) { - Set_TF_Status_from_Status(status, s); + tsl::Set_TF_Status_from_Status(status, s); return "// error"; } mlir::MLIRContext context; auto module = ConvertFunctionToMlir(fbody.get(), flib_def, &context); if (!module.ok()) { - Set_TF_Status_from_Status(status, module.status()); + tsl::Set_TF_Status_from_Status(status, module.status()); return "// error"; } @@ -211,7 +215,7 @@ std::string ImportGraphDef(const std::string& proto, auto s = ParseInputArrayInfo(input_names, input_data_types, input_data_shapes, &specs.inputs); if (!s.ok()) { - Set_TF_Status_from_Status(status, s); + tsl::Set_TF_Status_from_Status(status, s); return "// error"; } if (!output_names.empty()) { @@ -230,7 +234,7 @@ std::string ExperimentalConvertSavedModelToMlir( auto load_status = tensorflow::SavedModelV2Bundle::Load(saved_model_path, &bundle); if (!load_status.ok()) { - Set_TF_Status_from_Status(status, load_status); + tsl::Set_TF_Status_from_Status(status, load_status); return "// error"; } @@ -242,7 +246,7 @@ std::string ExperimentalConvertSavedModelToMlir( auto module_or = ConvertSavedModelToMlir( &bundle, &context, absl::Span(exported_names)); if 
(!module_or.status().ok()) { - Set_TF_Status_from_Status(status, module_or.status()); + tsl::Set_TF_Status_from_Status(status, module_or.status()); return "// error"; } @@ -266,7 +270,7 @@ std::string ExperimentalConvertSavedModelV1ToMlirLite( saved_model_path, tag_set, absl::Span(exported_names), &context, import_options); if (!module_or.status().ok()) { - Set_TF_Status_from_Status(status, module_or.status()); + tsl::Set_TF_Status_from_Status(status, module_or.status()); return "// error"; } @@ -275,7 +279,8 @@ std::string ExperimentalConvertSavedModelV1ToMlirLite( std::string ExperimentalConvertSavedModelV1ToMlir( const std::string& saved_model_path, const std::string& exported_names_str, - const std::string& tags, bool lift_variables, bool upgrade_legacy, + const std::string& tags, bool lift_variables, + bool include_variables_in_initializers, bool upgrade_legacy, bool show_debug_info, TF_Status* status) { // Load the saved model into a SavedModelBundle. @@ -286,7 +291,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( auto load_status = tensorflow::LoadSavedModel({}, {}, saved_model_path, tag_set, &bundle); if (!load_status.ok()) { - Set_TF_Status_from_Status(status, load_status); + tsl::Set_TF_Status_from_Status(status, load_status); return "// error"; } @@ -297,11 +302,13 @@ std::string ExperimentalConvertSavedModelV1ToMlir( tensorflow::MLIRImportOptions import_options; import_options.upgrade_legacy = upgrade_legacy; import_options.lift_variables = lift_variables; + import_options.include_variables_in_initializers = + include_variables_in_initializers; auto module_or = ConvertSavedModelV1ToMlir(bundle, absl::Span(exported_names), &context, import_options); if (!module_or.status().ok()) { - Set_TF_Status_from_Status(status, module_or.status()); + tsl::Set_TF_Status_from_Status(status, module_or.status()); return "// error"; } @@ -317,7 +324,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context); if (failed(pm.run(*module))) { - Set_TF_Status_from_Status(status, diagnostic_handler.ConsumeStatus()); + tsl::Set_TF_Status_from_Status(status, diagnostic_handler.ConsumeStatus()); return "// error"; } return MlirModuleToString(*module, show_debug_info); @@ -330,13 +337,16 @@ std::string ExperimentalRunPassPipeline(const std::string& mlir_txt, RegisterPasses(); mlir::DialectRegistry registry; mlir::RegisterAllTensorFlowDialects(registry); + mlir::stablehlo::registerAllDialects(registry); + registry.insert(); mlir::MLIRContext context(registry); mlir::OwningOpRef module; { mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context); module = mlir::parseSourceString(mlir_txt, &context); if (!module) { - Set_TF_Status_from_Status(status, diagnostic_handler.ConsumeStatus()); + tsl::Set_TF_Status_from_Status(status, + diagnostic_handler.ConsumeStatus()); return "// error"; } } @@ -353,7 +363,7 @@ std::string ExperimentalRunPassPipeline(const std::string& mlir_txt, mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context); if (failed(pm.run(*module))) { - Set_TF_Status_from_Status(status, diagnostic_handler.ConsumeStatus()); + tsl::Set_TF_Status_from_Status(status, diagnostic_handler.ConsumeStatus()); return "// error"; } return MlirModuleToString(*module, show_debug_info); @@ -363,13 +373,16 @@ void ExperimentalWriteBytecode(const std::string& filename, const std::string& mlir_txt, TF_Status* status) { mlir::DialectRegistry registry; mlir::RegisterAllTensorFlowDialects(registry); + 
mlir::stablehlo::registerAllDialects(registry); + registry.insert(); mlir::MLIRContext context(registry); mlir::OwningOpRef module; + mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context); { - mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context); module = mlir::parseSourceString(mlir_txt, &context); if (!module) { - Set_TF_Status_from_Status(status, diagnostic_handler.ConsumeStatus()); + tsl::Set_TF_Status_from_Status(status, + diagnostic_handler.ConsumeStatus()); return; } } @@ -378,13 +391,74 @@ void ExperimentalWriteBytecode(const std::string& filename, std::string error; std::unique_ptr outputFile = mlir::openOutputFile(filename, &error); + if (!error.empty()) { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + ("Unable to create output file " + error).c_str()); + return; + } + outputFile->keep(); + if (failed(mlir::writeBytecodeToFile(*module, outputFile->os(), + writer_config))) { + tsl::Set_TF_Status_from_Status(status, diagnostic_handler.ConsumeStatus()); + } +} + +void ExperimentalTFLiteToTosaBytecode( + const std::string& flatbuffer_file, const std::string& tosa_bytecode_file, + bool use_external_constant, + const std::vector& ordered_input_arrays, + const std::vector& ordered_output_arrays, TF_Status* status) { + mlir::DialectRegistry registry; + mlir::RegisterAllTensorFlowDialects(registry); + registry.insert(); + mlir::MLIRContext context(registry); + mlir::OwningOpRef module; + mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context); + { + mlir::Location loc = mlir::UnknownLoc::get(&context); + std::string error; + std::unique_ptr buffer = + mlir::openInputFile(flatbuffer_file, &error); + if (buffer == nullptr) { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + ("Unable to load input file " + error).c_str()); + return; + } + + auto buffer_view = + std::string_view(buffer->getBufferStart(), buffer->getBufferSize()); + module = tflite::FlatBufferToMlir( + buffer_view, &context, loc, use_external_constant, ordered_input_arrays, + ordered_output_arrays); + mlir::PassManager pm(&context, module.get()->getName().getStringRef(), + mlir::PassManager::Nesting::Implicit); + mlir::tosa::TOSATFLLegalizationPipelineOptions opts; + // This flow is specific to compilation backend, so set to true. + opts.target_compilation_backend = true; + // Temporary work-around for https://github.com/openxla/iree/issues/8974 + opts.dequantize_tfl_softmax = true; + createTFLtoTOSALegalizationPipeline(pm, opts); + if (failed(pm.run(*module))) { + tsl::Set_TF_Status_from_Status(status, + diagnostic_handler.ConsumeStatus()); + return; + } + } + mlir::FallbackAsmResourceMap fallback_resource_map; + mlir::BytecodeWriterConfig writer_config(fallback_resource_map); + std::string error; + std::unique_ptr outputFile = + mlir::openOutputFile(tosa_bytecode_file, &error); if (!error.empty()) { TF_SetStatus(status, TF_INVALID_ARGUMENT, ("Unable to create output file" + error).c_str()); return; } outputFile->keep(); - mlir::writeBytecodeToFile(*module, outputFile->os(), writer_config); + if (failed(mlir::writeBytecodeToFile(*module, outputFile->os(), + writer_config))) { + tsl::Set_TF_Status_from_Status(status, diagnostic_handler.ConsumeStatus()); + } } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/python/mlir.h b/tensorflow/compiler/mlir/python/mlir.h index 740971d4fb8..a17f4f2843e 100644 --- a/tensorflow/compiler/mlir/python/mlir.h +++ b/tensorflow/compiler/mlir/python/mlir.h @@ -19,6 +19,7 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_H_ #include +#include #include "absl/strings/string_view.h" #include "tensorflow/c/eager/c_api.h" @@ -95,7 +96,8 @@ std::string ExperimentalConvertSavedModelV1ToMlirLite( // A string of textual MLIR representing the raw imported SavedModel. std::string ExperimentalConvertSavedModelV1ToMlir( const std::string &saved_model_path, const std::string &exported_names_str, - const std::string &tags, bool lift_variables, bool upgrade_legacy, + const std::string &tags, bool lift_variables, + bool include_variables_in_initializers, bool upgrade_legacy, bool show_debug_info, TF_Status *status); std::string ExperimentalRunPassPipeline(const std::string &mlir_txt, @@ -107,6 +109,16 @@ std::string ExperimentalRunPassPipeline(const std::string &mlir_txt, void ExperimentalWriteBytecode(const std::string &filename, const std::string &mlir_txt, TF_Status *status); +// Loads a TFLite flatbuffer, convert to TOSA for backend compilation and +// produce an MLIR bytecode file as output. +// TODO(jpienaar): Refactor this when we use more implicit module passing +// between calls to avoid serialization overhead. +void ExperimentalTFLiteToTosaBytecode( + const std::string &flatbuffer_file, const std::string &tosa_bytecode_file, + bool use_external_constant, + const std::vector &ordered_input_arrays, + const std::vector &ordered_output_arrays, TF_Status *status); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_PYTHON_MLIR_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index 0ae09a43dd3..f85f8f13882 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -1,21 +1,97 @@ +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_cloud") load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library") load("//tensorflow/compiler/mlir/quantization/stablehlo:internal_visibility_allowlist.bzl", "internal_visibility_allowlist") +load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") + +# TODO(b/264218457): Create stablehlo-quantization-opt and register passes to actually test. package_group( name = "internal_visibility_allowlist_package", packages = [ "//tensorflow/compiler/mlir/lite/...", "//tensorflow/compiler/mlir/quantization/...", + "//tensorflow/lite/...", "//third_party/cloud_tpu/inference_converter/...", # TPU Inference Converter V1 ] + internal_visibility_allowlist(), ) +# TODO(b/264218457): Add quantize and post_quantize passes. +cc_library( + name = "passes", + srcs = [ + "passes/quantize_weight.cc", + ], + hdrs = [ + "passes/passes.h", + ], + compatible_with = get_compatible_with_cloud(), + deps = [ + ":quantization_options_proto_cc", + ":stablehlo_passes_inc_gen", + "//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/core/platform:path", + "//third_party/eigen3", + "@com_google_absl//absl/container:flat_hash_set", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@stablehlo//:stablehlo_ops", + ], + # Alwayslink is required for registering the MLIR passes. + # TODO(b/255530126): Split the pass registration from the definitions to avoid binary size bloat. 
+ alwayslink = True, +) + +cc_library( + name = "quantize_passes", + srcs = [ + "quantize_passes.cc", + ], + hdrs = [ + "quantize_passes.h", + ], + compatible_with = get_compatible_with_cloud(), + visibility = [":internal_visibility_allowlist_package"], + deps = [ + ":passes", + ":quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", + "//tensorflow/core/platform:path", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:Pass", + ], +) + +gentbl_cc_library( + name = "stablehlo_passes_inc_gen", + compatible_with = get_compatible_with_cloud(), + tbl_outs = [ + ( + [ + "-gen-pass-decls", + ], + "passes/passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/passes.td", + deps = [ + "@llvm-project//mlir:PassBaseTdFiles", + ], +) + tf_proto_library( name = "quantization_options_proto", srcs = ["quantization_options.proto"], cc_api_version = 2, - make_default_target_header_only = True, - visibility = [":internal_visibility_allowlist_package"], + visibility = ["//visibility:public"], ) # copybara:uncomment_begin(google-only) @@ -26,3 +102,7 @@ tf_proto_library( # deps = [":quantization_options_proto"], # ) # copybara:uncomment_end + +exports_files([ + "run_lit.sh", +]) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/internal_visibility_allowlist.bzl b/tensorflow/compiler/mlir/quantization/stablehlo/internal_visibility_allowlist.bzl index 0e302a08fd5..310b10e5d0f 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/internal_visibility_allowlist.bzl +++ b/tensorflow/compiler/mlir/quantization/stablehlo/internal_visibility_allowlist.bzl @@ -1,10 +1,5 @@ """Internal visibility rules.""" def internal_visibility_allowlist(): - """Returns a list of g3 packages that can depend on internal targets.""" - return [ - "//learning/brain/experimental/mlir/quantization/...", - "//learning/brain/mlir/quantization/tensorflow/...", - "//learning/brain/mobile/programmability/...", - "//learning/brain/experimental/tfq/...", - ] + """Returns a list of the packages that can depend on internal targets.""" + return [] diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h new file mode 100644 index 00000000000..788a00f349c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h @@ -0,0 +1,39 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_PASSES_H_ + +#include +#include + +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +#define GEN_PASS_DECL +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + +namespace mlir { +namespace stablehlo { + +// Creates a pass that quantizes weight component of StableHLO graph. +std::unique_ptr> CreateQuantizeWeightPass( + ::stablehlo::quantization::QuantizationOptions quantization_options); + +} // namespace stablehlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_PASSES_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td new file mode 100644 index 00000000000..959121888b6 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td @@ -0,0 +1,22 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/Pass/PassBase.td" + +def QuantizeWeightPass : Pass<"stablehlo-quantize-weight", "mlir::func::FuncOp"> { + let summary = "Quantizes the weight component of StableHLO graph."; + let constructor = "CreateQuantizeWeightPass()"; + let dependentDialects = ["stablehlo::StablehloDialect"]; +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc new file mode 100644 index 00000000000..9d5d0cc8e91 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc @@ -0,0 +1,244 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include +#include +#include + +#include "third_party/eigen3/Eigen/Core" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +// NOLINTNEXTLINE +//===----------------------------------------------------------------------===// +// The Quantization Pass for Weight. +//===----------------------------------------------------------------------===// +namespace mlir { +namespace stablehlo { + +namespace { +#define GEN_PASS_DEF_QUANTIZEWEIGHTPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + +using QuantizationUnits = llvm::SetVector>; + +// Min/Max values used for creating ConstantOp. +constexpr float kMaxFloat16Value = 65504.f; +constexpr float kMinFloat16Value = -65504.f; + +class QuantizeWeightPass + : public impl::QuantizeWeightPassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizeWeightPass) + + explicit QuantizeWeightPass( + ::stablehlo::quantization::QuantizationOptions quantization_options) {} + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "stablehlo-quantize-weight"; + } + + StringRef getDescription() const final { + return "Apply the specified quantization methods to weights."; + } + + private: + void runOnOperation() override; +}; + +// Collects quantizable target ops, then insert Q-DQ quantization patterns. +class QuantizeWeight : public OpRewritePattern { + public: + explicit QuantizeWeight(MLIRContext* context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(ConstantOp op, + PatternRewriter& rewriter) const override { + // 1. Collect quantizable ops. + QuantizationUnits quantizable_ops = GetQuantizableOps(op); + if (quantizable_ops.empty()) { + return failure(); + } + + // 2. Quantize collected ops. + if (!QuantizeOps(rewriter, op, quantizable_ops)) { + return failure(); + } + + // 3. Complete the Q-DQ pair for each inference type. + if (!ConvertToFloat16Constant(rewriter, op)) { + return failure(); + } + return success(); + } + + private: + // Marks users that are applicable for quantization where the criteria for + // determining quantizable ops differs by the inference type. + QuantizationUnits GetQuantizableOps(ConstantOp op) const { + // Non-float tensors do not need quantization. 
+ QuantizationUnits quantizable_ops; + ShapedType type = op.getType().dyn_cast(); + if (!type || !type.getElementType().isF32()) return quantizable_ops; + + Value value = op.getResult(); + + for (OpOperand& use : value.getUses()) { + Operation* user = use.getOwner(); + int operand_num = use.getOperandNumber(); + quantizable_ops.insert({user, operand_num}); + } + return quantizable_ops; + } + + // Returns whether quantization is applied to filtered users. + bool QuantizeOps(PatternRewriter& rewriter, ConstantOp op, + const QuantizationUnits& quantizable_ops) const { + // TODO(b/212514817): refactor mode checking to improve code quality. + for (const std::pair& quant_op : quantizable_ops) { + // For f16 quantization, quantize all constant ops as float16. + QuantizeOpAsFloat16(rewriter, op, quant_op); + } + // TODO(b/264218457): Return a value that accurately captures result status. + return true; + } + + // Inserts ConvertOp which is used for converting float32 ConstantOp into + // float16 quantization. If there is an existing ConvertOp connected to the + // ConstantOp, the quantizable_op will be rewired to the existing ConvertOp. + // This guarantees at most one ConvertOp is created for float32 to float16 + // conversion. + void QuantizeOpAsFloat16(PatternRewriter& rewriter, ConstantOp op, + const std::pair quant_op) const { + auto [quantizable_op, quantize_operand_num] = quant_op; + // If the constant is an output tensor, do nothing. + if (isa(quantizable_op)) { + return; + } + + TensorType old_result_type = + op.getResult().getType().dyn_cast(); + FloatType quantized_type = FloatType::getF16(op.getContext()); + ShapedType new_result_type = old_result_type.clone(quantized_type); + + // Insert ConvertOp if it does not exist yet. Otherwise, just rewire without + // creating a ConvertOp. + for (OpOperand& connected_op : op.getResult().getUses()) { + ConvertOp convert_op = + dyn_cast_or_null(connected_op.getOwner()); + // ConvertOp already exists. Rewire the existing convert op into f16. + if (convert_op && convert_op.getType() == new_result_type) { + quantizable_op->setOperand(quantize_operand_num, convert_op); + return; + } + } + rewriter.setInsertionPointAfter(op); + ConvertOp new_convert_op = rewriter.create( + op->getLoc(), new_result_type, op.getResult()); + quantizable_op->setOperand(quantize_operand_num, + new_convert_op.getResult()); + } + + // Returns whether a ConvertOp-Operation sequence can be converted into new + // ConstantOp-Convert-Operation. The new ConstantOp has float16 data type. + bool ConvertToFloat16Constant(PatternRewriter& rewriter, + ConstantOp op) const { + for (Operation* connected_op : op.getResult().getUsers()) { + ConvertOp convert_op = dyn_cast_or_null(connected_op); + // Skip if no convert op exists. + if (!convert_op || convert_op.getResult().use_empty()) continue; + + // Get types. + Type old_result_type = op.getResult().getType(); + ShapedType new_result_type = convert_op.getType().dyn_cast(); + + // Proceeds only if the converting is to float16. + if (!new_result_type.getElementType().isF16()) continue; + + // Convert values. 
+ std::vector new_values; + DenseFPElementsAttr value_attr = + op.getValue().cast(); + new_values.reserve(value_attr.getNumElements()); + + for (float value : value_attr.getValues()) { + new_values.push_back(Eigen::half( + std::min(std::max(value, kMinFloat16Value), kMaxFloat16Value))); + } + DenseElementsAttr new_value_attr = DenseFPElementsAttr::get( + new_result_type, ArrayRef(new_values)); + // Create new ConstantOp-ConvertOp-Operation sequences. At this moment, + // old ConstantOp is guaranteed to have one F32->F16 convert op regardless + // of its number of users. + rewriter.setInsertionPointAfter(op); + // create new F16 constant op in that location + ConstantOp new_const = rewriter.create( + op->getLoc(), new_result_type, new_value_attr); + ConvertOp dcast = + rewriter.create(op->getLoc(), old_result_type, new_const); + // replace all convert ops with dq op. + convert_op->replaceAllUsesWith(dcast); + // Return without scanning for the next ConvertOp as only one ConvertOp is + // connected to all quantizable ops. + return true; + } + return false; + } +}; + +// TODO(b/264218457): Refactors the current file to parse preset quantization +// options and allow modular control of quantization specs. +void QuantizeWeightPass::runOnOperation() { + func::FuncOp func = getOperation(); + MLIRContext* ctx = func.getContext(); + + RewritePatternSet patterns(ctx); + patterns.add(ctx); + + FrozenRewritePatternSet frozen_patterns(std::move(patterns)); + + if (failed(applyPatternsAndFoldGreedily(func, frozen_patterns))) { + signalPassFailure(); + } +} + +} // namespace + +// Creates an instance of the StableHLO dialect Quantize Weight pass. +std::unique_ptr> CreateQuantizeWeightPass( + ::stablehlo::quantization::QuantizationOptions quantization_options) { + return std::make_unique(quantization_options); +} +} // namespace stablehlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.proto b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.proto index 22163b54a6d..41834f95fd8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.proto +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.proto @@ -29,7 +29,7 @@ message QuantizationMethod { // NEXT ID: 2 message PresetQuantizationMethod { // Preset quantization methods that are supported as a stable API. - // NEXT ID: 3 + // NEXT ID: 5 enum PresetMethod { // TODO(b/266173150): Update preset methods after redefining quantization // pattern matching in DarwiNN. @@ -37,14 +37,24 @@ message PresetQuantizationMethod { METHOD_UNSPECIFIED = 0; // go/do-include-enum-unspecified // Apply default weight-only quantization. Weights are quantized during - // conversion, then dequantized during inference. Data type is as follows: - // Weight: i8, Bias: f32, Activation: f32, Input/output: f32 + // conversion, then dequantized during inference. + // Activation: f32, Weight: qi8, Bias: f32 WEIGHT_ONLY = 1; // Apply default dynamic range quantization. Quantized tensor value's - // ranges are determined during graph runtime. Data type is as follows: - // Weight: i8, Bias: f32, Activation: f32, Input/output: f32 - DYNAMIC_RANGE = 2; + // ranges are determined during graph runtime. + // Activation: f32, Weight: qi8, Bias: f32 + POST_TRAINING_QUANTIZATION_DYNAMIC_RANGE = 2; + + // Apply float16 quantization to all the weights. Quantized weights will be + // dequantized before running inference. 
+ // Activation: f32, Weight: f16, Bias: f16 + FLOAT16 = 3; + + // Apply static range quantization. The quantization range is determined + // via calibration phase and quantized during conversion. + // Activation: qi8, Weight: qi8, Bias: qi32 + POST_TRAINING_QUANTIZATION_STATIC_RANGE = 4; } PresetMethod preset_method = 1; } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.cc new file mode 100644 index 00000000000..05290bcb126 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.cc @@ -0,0 +1,32 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace stablehlo { +namespace quantization { + +void AddQuantizationPasses(mlir::PassManager& pass_manager, + const QuantizationOptions& quantization_options) { + pass_manager.addNestedPass( + mlir::stablehlo::CreateQuantizeWeightPass(quantization_options)); +} + +} // namespace quantization +} // namespace stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h new file mode 100644 index 00000000000..d754be94fc6 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h @@ -0,0 +1,31 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_QUANTIZE_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_QUANTIZE_PASSES_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +namespace stablehlo { +namespace quantization { +// Adds passes for quantization of individual quantizable components. +// (i.e. 
activation, weight, bias) +void AddQuantizationPasses(mlir::PassManager& pass_manager, + const QuantizationOptions& quantization_options); + +} // namespace quantization +} // namespace stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_QUANTIZE_PASSES_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD new file mode 100644 index 00000000000..00c76a029e9 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD @@ -0,0 +1,29 @@ +load("//tensorflow:tensorflow.default.bzl", "filegroup") +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + licenses = ["notice"], +) + +glob_lit_tests( + data = [":test_utilities"], + driver = "//tensorflow/compiler/mlir/quantization/stablehlo:run_lit.sh", + size_override = { + }, + tags_override = { + }, + test_file_exts = ["mlir"], +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "test_utilities", + testonly = True, + data = [ + "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", + "@llvm-project//mlir:run_lit.sh", + # TODO(b/254144841): Add tests in this directory with the proper stablehlo-opt. + ], +) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD index a8045b28116..2d42d137f9b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD @@ -8,8 +8,8 @@ load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") package_group( name = "internal_visibility_allowlist_package", packages = [ - "//tensorflow/compiler/mlir/quantization/...", "//tensorflow/compiler/mlir/lite/...", + "//tensorflow/compiler/mlir/quantization/...", ] + internal_visibility_allowlist(), ) @@ -430,7 +430,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/mlir/utils:name_utils", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/mlir_hlo", @@ -442,6 +442,7 @@ cc_library( "//tensorflow/core/platform:env", "//tensorflow/core/platform:macros", "//tensorflow/core/platform:path", + "//tensorflow/core/tpu:tpu_defs", "//tensorflow/lite/kernels:padding", "//tensorflow/lite/kernels/internal:quantization_util", "//tensorflow/tsl/platform:str_util", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc index f5d0d9a3542..17e12765f6f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc @@ -48,7 +48,8 @@ std::unique_ptr GetTFOpQuantSpec(Operation* op) { } } else if (function_name.contains("matmul")) { spec->coeff_op_quant_dim[1] = -1; - if (function_name.contains("with_bias")) { + if (function_name.contains("with_bias") || + function_name.contains("and_bias")) { spec->biases_params[2] = {{0, 1}, quant::GetUniformQuantizedTypeForBias}; } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc 
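For context, the new AddQuantizationPasses entry point is meant to be driven from an MLIR pass pipeline. A minimal, hypothetical caller is sketched below; the module/context handling is illustrative, and the commented-out setter chain for picking a preset method is an assumption about the generated proto API rather than something shown in this diff:

#include "mlir/IR/BuiltinOps.h"          // from @llvm-project
#include "mlir/Pass/PassManager.h"       // from @llvm-project
#include "mlir/Support/LogicalResult.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h"

// Sketch only: runs the StableHLO weight-quantization pipeline on `module`.
mlir::LogicalResult QuantizeModuleWeights(mlir::ModuleOp module,
                                          mlir::MLIRContext& context) {
  ::stablehlo::quantization::QuantizationOptions options;
  // Assumed accessor chain; consult the generated proto API for the real one.
  // options.mutable_quantization_method()
  //     ->mutable_preset_quantization_method()
  //     ->set_preset_method(PresetQuantizationMethod::FLOAT16);

  mlir::PassManager pm(&context);
  ::stablehlo::quantization::AddQuantizationPasses(pm, options);
  return pm.run(module);
}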
b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc index a7839292fee..f8d46612814 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc @@ -47,6 +47,66 @@ class CastBf16OpsToF32Pass void runOnOperation() override; }; +class CastBf16OpsToF32 : public RewritePattern { + public: + explicit CastBf16OpsToF32(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + + private: + LogicalResult match(Operation* op) const override { + if (isa(op) || + op->getName().hasTrait()) { + return failure(); + } + for (Value input : op->getOperands()) { + if (getElementTypeOrSelf(input).isBF16()) { + return success(); + } + } + for (Value value : op->getResults()) { + if (getElementTypeOrSelf(value).isBF16()) { + return success(); + } + } + return failure(); + } + + void rewrite(Operation* op, PatternRewriter& rewriter) const override { + // Casts inputs of the operation. + for (int i = 0; i < op->getNumOperands(); i++) { + Value input = op->getOperand(i); + if (getElementTypeOrSelf(input).isBF16()) { + Value f32_cast = rewriter.create( + op->getLoc(), + CloneTypeWithNewElementType(input.getType(), rewriter.getF32Type()), + input); + op->setOperand(i, f32_cast); + } + } + + // Casts BF16 outputs of the operation. + for (Value value : op->getResults()) { + if (getElementTypeOrSelf(value).isBF16()) { + value.setType(CloneTypeWithNewElementType(value.getType(), + rewriter.getF32Type())); + rewriter.setInsertionPointAfterValue(value); + for (Operation* user : op->getUsers()) { + for (int i = 0; i < user->getNumOperands(); i++) { + if (user->getOperand(i) == value) { + Value bf16_cast = rewriter.create( + user->getLoc(), + CloneTypeWithNewElementType(value.getType(), + rewriter.getBF16Type()), + value); + user->setOperand(i, bf16_cast); + } + } + } + } + } + } +}; + #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.inc" void CastBf16OpsToF32Pass::runOnOperation() { @@ -54,6 +114,7 @@ void CastBf16OpsToF32Pass::runOnOperation() { RewritePatternSet patterns(ctx); auto module_op = getOperation(); + patterns.add(ctx); populateWithGenerated(patterns); if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.td index 5e38c6b1681..ace1a77e6f3 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.td @@ -32,148 +32,3 @@ def RemoveUnneededCastOps : Pat< (replaceWithValue $input), [(AreTheSameElementType $input, $output)]>; -// Cast BF16 Conv2D ops to FP32 Conv2D ops. Inputs and -// filters will be casted to fp32 as well, and unused -// BF16 constant values will be removed by the compiler. 
-def CastBFloat16ConvToFloat32 : Pat< - (TF_Conv2DOp:$res - $input, $filter, $strides, $use_cudnn_on_gpu, $padding, - $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), - (TF_CastOp - (TF_Conv2DOp - (TF_CastOp - $input, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $input))), - (TF_CastOp - $filter, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $filter))), - $strides, $use_cudnn_on_gpu, $padding, - $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations, - (returnType (CloneTypeWithF32ElementType $res))), - /*truncate=*/ConstBoolAttrFalse), - [(IsBF16ElementType $input), - (IsBF16ElementType $filter)], - (addBenefit 1)>; - -// Casts BF16 BiasAdd ops to F32 to optimize quantizable ops followed by -// BiasAdd ops. This cast will cover Conv + BiasAdd, MatMul + BiasAdd, -// etc. -def CastBFloat16BiasAddToFloat32 : Pat< - (TF_BiasAddOp:$res - $input, $bias, IsDataFormatNHWC:$bias_data_format), - (TF_CastOp - (TF_BiasAddOp - (TF_CastOp - $input, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $input))), - (TF_CastOp - $bias, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $bias))), - IsDataFormatNHWC:$bias_data_format, - (returnType (CloneTypeWithF32ElementType $res))), - /*truncate=*/ConstBoolAttrFalse), - [(IsBF16ElementType $input), - (IsBF16ElementType $bias)], - (addBenefit 1)>; - -def CastBFloat16AvgPoolToFloat32 : Pat< - (TF_AvgPoolOp:$res - $input, $ksize, $strides, $padding, - IsDataFormatNHWC:$data_format), - (TF_CastOp - (TF_AvgPoolOp - (TF_CastOp - $input, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $input))), - $ksize, $strides, $padding, - IsDataFormatNHWC:$data_format, - (returnType (CloneTypeWithF32ElementType $res))), - /*truncate=*/ConstBoolAttrFalse), - [(IsBF16ElementType $input)], - (addBenefit 1)>; - -def CastBFloat16MatMulToFloat32 : Pat< - (TF_MatMulOp:$res - $input, $filter, $transpose_a, $transpose_b), - (TF_CastOp - (TF_MatMulOp - (TF_CastOp - $input, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $input))), - (TF_CastOp - $filter, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $filter))), - $transpose_a, $transpose_b, - (returnType (CloneTypeWithF32ElementType $res))), - /*truncate=*/ConstBoolAttrFalse), - [(IsBF16ElementType $input), - (IsBF16ElementType $filter)], - (addBenefit 1)>; - -def CastBFloat16BatchMatMulV2ToFloat32 : Pat< - (TF_BatchMatMulV2Op:$res - $input, $filter, $adj_x, $adj_y), - (TF_CastOp - (TF_BatchMatMulV2Op - (TF_CastOp - $input, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $input))), - (TF_CastOp - $filter, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $filter))), - $adj_x, $adj_y, - (returnType (CloneTypeWithF32ElementType $res))), - /*truncate=*/ConstBoolAttrFalse), - [(IsBF16ElementType $input), - (IsBF16ElementType $filter)], - (addBenefit 1)>; - -def CastBFloat16DepthwiseConvToFloat32 : Pat< - (TF_DepthwiseConv2dNativeOp:$res - $input, $filter, $strides, $padding, - $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), - (TF_CastOp - (TF_DepthwiseConv2dNativeOp - (TF_CastOp - $input, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $input))), - (TF_CastOp - $filter, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $filter))), - $strides, $padding, $explicit_paddings, - 
IsDataFormatNHWC:$data_format, $dilations, - (returnType (CloneTypeWithF32ElementType $res))), - /*truncate=*/ConstBoolAttrFalse), - [(IsBF16ElementType $input), - (IsBF16ElementType $filter)], - (addBenefit 1)>; - -def CastBFloat16GatherToFloat32 : Pat< - (TF_GatherV2Op:$res - $params, $indices, $axis, $batch_dims), - (TF_CastOp - (TF_GatherV2Op - (TF_CastOp - $params, /*truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $params))), - $indices, $axis, $batch_dims, - (returnType (CloneTypeWithF32ElementType $res))), - /*truncate=*/ConstBoolAttrFalse), - [(IsBF16ElementType $params), - (IsConstTensor $params)], - (addBenefit 1)>; - -// Converts an AddV2 op accepting two bfloat16 operands into the one taking two -// float32 operands. -def CastBFloat16AddV2ToFloat32 : Pat< - (TF_AddV2Op:$res $x, $y), - (TF_CastOp - (TF_AddV2Op - (TF_CastOp $x, /*Truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $x))), - (TF_CastOp $y, /*Truncate=*/ConstBoolAttrFalse, - (returnType (CloneTypeWithF32ElementType $y)))), - /*Truncate=*/ConstBoolAttrFalse), - [(IsBF16ElementType $x), - (IsBF16ElementType $y), - (IsBF16ElementType $res)]>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.cc index 586fe870808..bb606714023 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.cc @@ -23,7 +23,9 @@ limitations under the License. #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_identity_op_pattern.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/tpu/tpu_defs.h" namespace mlir { namespace quant { @@ -56,17 +58,24 @@ class RemoveTpuOp : public RewritePattern { : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} private: - LogicalResult matchAndRewrite(Operation* call_op, + LogicalResult matchAndRewrite(Operation* op, PatternRewriter& rewriter) const override { + // Remove `_tpu_replicate` attributes on each operation first. + if (op->hasAttr(tensorflow::kTPUReplicateAttr)) { + op->removeAttr(tensorflow::kTPUReplicateAttr); + return success(); + } + + // Remove TPU operations. if (isa(call_op)) { - call_op->erase(); + TF::TPUOrdinalSelectorOp>(op)) { + op->erase(); } else if (auto replicated_input_op = - dyn_cast_or_null(call_op)) { + dyn_cast_or_null(op)) { // TODO(b/267700110): Handle multiple input/output cases. rewriter.replaceOp(replicated_input_op, replicated_input_op.getInputs()); } else if (auto replicated_output_op = - dyn_cast_or_null(call_op)) { + dyn_cast_or_null(op)) { // TODO(b/267700110): Handle multiple input/output cases. 
rewriter.replaceOp(replicated_output_op, replicated_output_op.getInput()); } else { @@ -115,6 +124,7 @@ void ConvertTpuModelToCpuPass::runOnOperation() { patterns.add(ctx); patterns.add(ctx); + patterns.add(ctx); if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { module_op.emitError() << "quant-convert-tpu-model-to-cpu pattern " diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_quantized_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_quantized_functions.cc index 84e6f0781fb..96d42ebedf3 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_quantized_functions.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_quantized_functions.cc @@ -171,9 +171,8 @@ void InsertQuantizedFunctionsPass::runOnOperation() { StatusScopedDiagnosticHandler diagnostic_handler(context); if (failed(pm.run(*module_ref))) { - emitError(module.getLoc()) - << "failed to apply the optimization: " - << diagnostic_handler.ConsumeStatus().error_message(); + emitError(module.getLoc()) << "failed to apply the optimization: " + << diagnostic_handler.ConsumeStatus().message(); signalPassFailure(); return; } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td index 04934a479de..1eaaecf5b61 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td @@ -152,6 +152,22 @@ def LiftMatmulWithBias : Pat< (NamedAttr<"transpose_b"> $transpose_b))), [(IsNotInLiftedFunc $res)], (addBenefit 5)>; +// TODO(b/278493977): Create generic implementation of lifting any fused op +// with any reshaping op +def LiftMatmulWithReshapeAndBias : Pat< + (TF_BiasAddOp:$res + (TF_ReshapeOp:$out + (TF_MatMulOp $a, $b, $transpose_a, $transpose_b), + $shape), + $bias, IsDataFormatNHWC:$bias_data_format), + (LiftAsFunctionCall<"composite_matmul_with_reshape_and_bias_fn"> + (ArgumentList $a, $b, $bias, $shape), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"transpose_a"> $transpose_a), + (NamedAttr<"transpose_b"> $transpose_b))), + [(IsNotInLiftedFunc $res)], (addBenefit 5)>; + def LiftConv3dWithBias : Pat< (TF_BiasAddOp:$res (TF_Conv3DOp $input, $filter, $strides, $padding, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc index 6c73b266837..1d4db2b7067 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc @@ -13,28 +13,50 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include #include #include +#include #include #include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_identity_op_pattern.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" namespace mlir { namespace quant { @@ -47,7 +69,11 @@ class PrepareLiftingPass PrepareLiftingPass() = default; - explicit PrepareLiftingPass(const OpSet op_set) : op_set_(op_set) {} + explicit PrepareLiftingPass(OpSet op_set) { op_set_ = op_set; } + + PrepareLiftingPass(const PrepareLiftingPass& other) { + op_set_ = other.op_set_; + } StringRef getArgument() const final { // This is the argument used to refer to the pass in @@ -68,7 +94,15 @@ class PrepareLiftingPass void runOnOperation() override; private: - OpSet op_set_; + Option op_set_{ + *this, "target-opset", llvm::cl::init(OpSet::TF), + llvm::cl::desc("Choose target opset."), + llvm::cl::values( + clEnumValN(OpSet::TF, "TF", + "Uses TF ops that mimic quantization behavior"), + clEnumValN(OpSet::XLA, "XLA", "Uses TF XLA ops"), + clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED", + "Uses TF Uniform Quantized ops"))}; }; // Check if given indices in `val1` has same number of elements as given @@ -116,10 +150,14 @@ LogicalResult MatchSupportedAffineOp(Operation* op, Value& binding_output, is_supported_affine_op = data_format.getValue().equals("NHWC") || data_format.getValue().equals("NDHWC"); } - } else if (llvm::isa(op)) { + } else if (llvm::isa(op)) { if (const auto adj_y = op->getAttrOfType("adj_y")) { is_supported_affine_op = !adj_y.getValue(); } 
+ } else if (llvm::isa(op)) { + if (const auto adj_y = op->getAttrOfType("transpose_b")) { + is_supported_affine_op = !adj_y.getValue(); + } } if (!is_supported_affine_op) return failure(); @@ -141,7 +179,7 @@ Value MakeOneDimValueBroadcastable(OpBuilder& builder, Location loc, } int64_t num_elements = value_shape.getNumElements(); - llvm::SmallVector new_shape; + SmallVector new_shape; for (auto idx : llvm::reverse(llvm::seq(0, rhs_shape.getRank()))) { const int64_t rhs_dim = rhs_shape.getDimSize(idx); if (num_elements % rhs_dim != 0) { @@ -260,6 +298,243 @@ Value MultiplyFakeQuantValue(OpBuilder& builder, Location loc, Value value, return ConstantFoldOpIfPossible(dequantize).front(); } +// Generate an einsum equation from the given DotDimensionNumber. +std::string CreateEinsumEquation( + const xla::DotDimensionNumbers& dot_dimension_numbers, const int lhs_rank, + const int rhs_rank) { + // Prepare necessary indices. + absl::flat_hash_set lhs_batch_idx, rhs_batch_idx; + absl::flat_hash_set lhs_contract_idx, rhs_contract_idx; + lhs_batch_idx.insert(dot_dimension_numbers.lhs_batch_dimensions().begin(), + dot_dimension_numbers.lhs_batch_dimensions().end()); + lhs_contract_idx.insert( + dot_dimension_numbers.lhs_contracting_dimensions().begin(), + dot_dimension_numbers.lhs_contracting_dimensions().end()); + rhs_batch_idx.insert(dot_dimension_numbers.rhs_batch_dimensions().begin(), + dot_dimension_numbers.rhs_batch_dimensions().end()); + rhs_contract_idx.insert( + dot_dimension_numbers.rhs_contracting_dimensions().begin(), + dot_dimension_numbers.rhs_contracting_dimensions().end()); + + // Generate equation. + std::string lhs_eq = ""; + std::string rhs_eq = ""; + std::string out_eq = ""; + char c = 'a'; + std::vector lhs_batch_dims; + std::vector lhs_contract_dims; + for (int i = 0; i < lhs_rank; i++) { + absl::StrAppend(&lhs_eq, std::string(1, c)); + if (lhs_batch_idx.contains(i)) { + lhs_batch_dims.push_back(c); + } else if (lhs_contract_idx.contains(i)) { + lhs_contract_dims.push_back(c); + } + c++; + } + + int batch_trace_idx = 0; + int contract_trace_idx = 0; + const bool rhs_only_batch = lhs_batch_dims.empty(); + for (int i = 0; i < rhs_rank; i++) { + if (rhs_batch_idx.contains(i)) { + if (rhs_only_batch) { + rhs_eq.push_back(c); + lhs_batch_dims.push_back(c); + c++; + } else { + rhs_eq.push_back(lhs_batch_dims[batch_trace_idx]); + batch_trace_idx++; + } + } else if (rhs_contract_idx.contains(i)) { + absl::StrAppend(&rhs_eq, + std::string(1, lhs_contract_dims[contract_trace_idx])); + contract_trace_idx++; + } else { + rhs_eq += c; + c++; + } + } + + // Create out_eq by merging lhs and rhs. + // In XlaDotv2 style - batch dim - leftover from lhs - leftover from rhs. 
+ for (const char c : lhs_batch_dims) { + absl::StrAppend(&out_eq, std::string(1, c)); + } + for (const char c : lhs_eq) { + if (!absl::StrContains(out_eq, c) && !absl::StrContains(rhs_eq, c)) { + absl::StrAppend(&out_eq, std::string(1, c)); + } + } + for (const char c : rhs_eq) { + if (!absl::StrContains(out_eq, c) && !absl::StrContains(lhs_eq, c)) { + absl::StrAppend(&out_eq, std::string(1, c)); + } + } + + return absl::StrCat(lhs_eq, ",", rhs_eq, "->", out_eq); +} + +Value CreateEinsumOpFromXlaDotV2Op(OpBuilder& builder, const Location loc, + Value lhs, Value rhs, Value output, + StringAttr dot_dimension_numbers_str) { + xla::DotDimensionNumbers dot_dimension_numbers; + dot_dimension_numbers.ParseFromString(dot_dimension_numbers_str.str()); + SmallVector input_arguments = {lhs, rhs}; + const int lhs_rank = + lhs.getType().template cast().getShape().size(); + const int rhs_rank = + rhs.getType().template cast().getShape().size(); + + const std::string einsum_equation = + CreateEinsumEquation(dot_dimension_numbers, lhs_rank, rhs_rank); + + return builder.create(loc, output.getType(), input_arguments, + builder.getStringAttr(einsum_equation)); +} + +// Restores the collapsed dimensions to the `tensor_type`. `collapsed_dims` +// designate the dimension indices that were collapsed to produce `tensor_type`. +// The restored dimensions' sizes are 1, according to the semantics of +// `XlaGatherOp (https://www.tensorflow.org/xla/operation_semantics#gather). The +// resulting type's shape has `tensor_type.size() + collapsed_dims.size()` +// dimensions. +RankedTensorType RestoreCollapsedDimensions( + const RankedTensorType tensor_type, + const absl::flat_hash_set& collapsed_dims) { + ArrayRef original_tensor_shape = tensor_type.getShape(); + const int output_tensor_rank = + original_tensor_shape.size() + collapsed_dims.size(); + auto shape_itr = tensor_type.getShape().begin(); + + // Populate the dimensions of the output shape, including the restored + // dimensions. + SmallVector output_shape(output_tensor_rank); + for (int i = 0; i < output_tensor_rank; i++) { + if (collapsed_dims.contains(i)) { + // The collapsed dimension's size should have been 1, so it restores the + // dimension with size 1. + output_shape[i] = 1; + } else { + output_shape[i] = *shape_itr; + shape_itr++; + } + } + + return RankedTensorType::get(output_shape, tensor_type.getElementType()); +} + +// Determines the output type of the `SliceOp` when it is being inserted in +// place of a `XlaGatherOp`. When the dimensions of `xla_gather_op_output_type` +// is known, the `collapsed_dims` are restored. `xla_gather_op_output_type` is +// the result of collapsing the `collapsed_dims`, but the `SliceOp`'s output +// should not have the dimensions collapsed already. Returns +// `xla_gather_op_output_type` unchanged if the rank is unknown. +// +// Examples: +// * If `xla_gather_op_output_type` == tensor<*xf32>, then it returns: +// tensor<*xf32>. +// * If `xla_gather_op_output_type` == tensor<3x5xi32> and `collapsed_dims` == +// {0}, then it returns: tensor<1x3x5xi32>. +// * If `xla_gather_op_output_type` == tensor<3x5xf32> and `collapsed_dims` == +// {1, 3}, then it returns: tensor<3x1x5x1xf32>. 
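A concrete illustration of the equation builder (values chosen for illustration only; CreateEinsumEquation is file-local, so this is not a runnable test against the real symbol): for a plain batched matmul with rank-3 operands, batch dimension 0 on both sides, and contracting dimensions lhs=2 / rhs=1, the labels come out as lhs="abc", rhs="acd", out="abd".

// Illustrative dimension numbers for a rank-3 batched matmul.
xla::DotDimensionNumbers dnums;
dnums.add_lhs_batch_dimensions(0);
dnums.add_rhs_batch_dimensions(0);
dnums.add_lhs_contracting_dimensions(2);
dnums.add_rhs_contracting_dimensions(1);
// CreateEinsumEquation(dnums, /*lhs_rank=*/3, /*rhs_rank=*/3) then produces
//   "abc,acd->abd"
// i.e. the usual batched-matmul einsum: batch dim first, then the leftover
// lhs dim, then the leftover rhs dim.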
+Type GetSliceOpOutputType(Type xla_gather_op_output_type, + const absl::flat_hash_set& collapsed_dims) { + if (auto ranked_output_type = + xla_gather_op_output_type.dyn_cast(); + ranked_output_type) { + return RestoreCollapsedDimensions(ranked_output_type, collapsed_dims); + } + + return xla_gather_op_output_type; +} + +// TODO (b/275225582): Supports Xla Gather op in general case. +bool IsXlaGatherWithoutBatch(Value operand, Value start_indices) { + auto operand_type = operand.getType().dyn_cast_or_null(); + auto start_indices_type = + start_indices.getType().dyn_cast_or_null(); + if (start_indices_type == nullptr || operand_type == nullptr) return false; + return start_indices_type.getShape().size() == 1; +} + +Value CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch( + OpBuilder& builder, const Location loc, Value operand, Value start_indices, + Value slice_sizes, Value output, StringAttr dimension_numbers_str) { + // Reads dimension numbers. + xla::GatherDimensionNumbers dimension_numbers; + dimension_numbers.ParseFromString(dimension_numbers_str.str()); + + // Construct full start_indices with given start_indices and + // start_index_map. + const ArrayRef operand_shape = + operand.getType().cast().getShape(); + const int64_t operand_rank = operand_shape.size(); + + // Fills zeros if start_index is not given in start_indices. + Value empty_start_indices = builder.create( + loc, RankedTensorType::get({operand_rank}, builder.getI64Type()), + /*shape=*/Create1DConstValue(builder, loc, {operand_rank}), + /*value=*/CreateScalarConstValue(builder, loc, 0)); + + // Converts start_index_map proto to tensor. + const int64_t index_map_size = dimension_numbers.start_index_map().size(); + SmallVector indices(index_map_size); + for (int64_t i = 0; i < index_map_size; i++) { + indices[i] = dimension_numbers.start_index_map()[i]; + } + + // Fill elements from start_indices with start_index_map + Value scattered_start_indices = builder.create( + loc, empty_start_indices, + /*indices=*/ + builder.create( + loc, RankedTensorType::get({index_map_size, 1}, builder.getI64Type()), + Create1DConstValue(builder, loc, indices), + Create1DConstValue(builder, loc, {index_map_size, 1})), + /*value=*/ + builder.create( + loc, + RankedTensorType::get( + start_indices.getType().template cast().getShape(), + builder.getI64Type()), + start_indices)); + + absl::flat_hash_set collapsed_dims; + collapsed_dims.insert(dimension_numbers.collapsed_slice_dims().begin(), + dimension_numbers.collapsed_slice_dims().end()); + + // Slice operand by constructed start_indices and slice_sizes. + auto slice_op = builder.create( + loc, GetSliceOpOutputType(output.getType(), collapsed_dims), operand, + /*start_indices=*/scattered_start_indices, + /*slice_sizes=*/ + builder.create( + loc, + RankedTensorType::get( + slice_sizes.getType().template cast().getShape(), + builder.getI64Type()), + slice_sizes)); + + // Collapses dimensions by reshaping. 
+ SmallVector new_shape(operand_rank - collapsed_dims.size()); + for (int64_t i = 0, j = 0; i < operand_rank; i++) { + if (!collapsed_dims.contains(i)) { + new_shape[j++] = operand_shape[i]; + } + } + if (!new_shape.empty()) new_shape[0] = -1; + return builder.create( + loc, output.getType(), slice_op, + Create1DConstValue(builder, loc, new_shape)); +} + +bool IsPrecisionEmpty(StringAttr prec_str) { + xla::PrecisionConfig prec; + prec.ParseFromString(prec_str.str()); + return !prec.operand_precision_size(); +} + #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.inc" void PrepareLiftingPass::runOnOperation() { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td index 52d4505781c..6f6e6d89da6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td @@ -21,12 +21,48 @@ include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" include "mlir/Dialect/Arith/IR/ArithOps.td" +// Creates Einsum Op from XlaDotV2 Op by generating equation. +def CreateEinsumOpFromXlaDotV2Op : NativeCodeCall< + "CreateEinsumOpFromXlaDotV2Op($_builder, $_loc, $0...)">; + +// Only handles the case where precision config is default. +def IsPrecisionEmpty : + Constraint>; + +// Convert XlaDotV2 Op to Einsum Op with above two functions. +def ConvertXlaDotV2OpToEinsumOp : Pat< + (TF_XlaDotV2Op:$dot $lhs, $rhs, $dot_dimension_numbers, $precision_config), + (CreateEinsumOpFromXlaDotV2Op $lhs, $rhs, $dot, $dot_dimension_numbers), + [(IsPrecisionEmpty $precision_config)]>; + // Converts arith.constant ops from freezing passes back to tf.Const ops. def ConvertArithConstToTfConst : Pat< (Arith_ConstantOp:$res DenseElementsAttr:$value), (TF_ConstOp $value), [(AnyStaticShapeTensor $res)]>; +// Converts CheckNumerics op to Identity +def ConvertCheckNumerics : Pat< + (TF_CheckNumericsOp $arg, $msg), + (TF_IdentityOp $arg)>; + +// Only handles the case where batch_dimension is empty. +def IsXlaGatherWithoutBatch : + Constraint>; + +// Create Slice op from XlaGather op without batch dimension. +def CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch : NativeCodeCall< + "CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch($_builder, $_loc, $0...)">; + +// Convert XlaGather op without batch to Slice op with above two functions. +def ConvertXlaGatherOpWithoutBatch : Pat< + (TF_XlaGatherOp:$gather $operand, + $start_indices, $slice_sizes, $dimension_numbers, $indices_are_sorted), + (CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch $operand, + $start_indices, $slice_sizes, $gather, $dimension_numbers), + [(IsXlaGatherWithoutBatch $operand, $start_indices)]>; + + // Converts tf.FusedBatchNormV3 into a sequence of more primitive arithmetic // operations. 
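To make the gather-to-slice rewrite above concrete, here is a worked example under assumed shapes (not taken from a test in this change): with an operand of shape [16, 64], rank-1 start_indices, start_index_map = [0], collapsed_slice_dims = [0], and slice_sizes = [1, 64], the scattered start indices become [index, 0], the Slice that replaces the XlaGather produces a [1, 64] result (GetSliceOpOutputType restores the collapsed dimension with size 1), and the final Reshape with new_shape = [-1] collapses it back to the original gather output shape [64].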
Specifically, performs the following calculation: // @@ -104,6 +140,19 @@ def ConvertAddToBiasAdd : Pat< [(HasRankOf<1> $add_rhs_value), (HasEqualElementSize<[-1], [0]> $conv_out, $add_rhs)]>; +// TODO(b/278493977): Create generic implementation of lifting any fused op +// with any reshaping op +def ConvertAddWithReshapeToBiasAddWithReshape : Pat< + (TF_AddV2Op + (TF_ReshapeOp:$reshape_out + (SupportedAffineOpMatcher $_, $_, $_), + $_ + ), + (TF_ConstOp:$add_rhs IsFloatElementsAttr:$add_rhs_value)), + (TF_BiasAddOp $reshape_out, $add_rhs, (CreateStringAttr<"NHWC">)), + [(HasRankOf<1> $add_rhs_value), + (HasEqualElementSize<[-1], [0]> $reshape_out, $add_rhs)]>; + // Fuse consecutive BiasAddOp and an AddV2Op. def FuseBiasAndAddV2 : Pat< (TF_AddV2Op diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc index ce8aac4b8fa..bf93774e67f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc @@ -15,38 +15,42 @@ limitations under the License. // Copied and modified from // //third_party/tensorflow/compiler/mlir/lite/transforms/quantize.cc // This transformation pass applies quantization on TF dialect. +#include #include #include #include "absl/container/flat_hash_set.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" -#include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/framework/types.pb.h" namespace mlir { namespace quant { @@ -81,14 +85,28 @@ struct TFQuantizationBase // range 
quantization. static bool AllowDynamicRangeQuantizedOperand( Operation* quantized_op, const CustomMap& custom_op_map) { - return quantization_trait == kDynamicRangeQuantization; + auto call_op = cast(quantized_op); + StringRef function_name = + call_op.getFAttr().cast().getValue(); + // The below can be generalized as there are more read-only ops added such + // as slice. + const bool is_gather = function_name.contains("gather"); + return quantization_trait != kFullQuantization || is_gather; } // All the quantized ops are supported if the quantization method is dynamic // range quantization. static bool AllowDynamicRangeQuantizedResult(Operation* quantized_op, const CustomMap& custom_op_map) { - return quantization_trait == kDynamicRangeQuantization; + auto call_op = cast(quantized_op); + StringRef function_name = + call_op.getFAttr().cast().getValue(); + // The below can be generalized as there are more read-only ops added such + // as slice. + bool is_gather = false; + if (function_name.contains("gather")) is_gather = true; + return quantization_trait != kFullQuantization || + (quantization_trait == kFullQuantization && is_gather); } // If weight_only_quantization is true, the legacy weight-only quantization is @@ -164,7 +182,7 @@ class QuantizeSameScaleOpsPattern LogicalResult matchAndRewrite(quantfork::DequantizeCastOp op, PatternRewriter& rewriter) const override { - llvm::SmallVector quantizing_ops; + SmallVector quantizing_ops; auto users = op.getResult().getUsers(); quantizing_ops.append(users.begin(), users.end()); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc index 31f0ade7ef8..6c374141025 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc @@ -452,24 +452,47 @@ LogicalResult TransferTFAttributesToTFUniformAttributes( // Set the attributes for ops with the attr_map attribute. 
for (Operation& inner_op : quantized_func.getBody().front().getOperations()) { if (auto uniform_op = - llvm::dyn_cast(inner_op)) { + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { if (failed(FillAttributesForUniformQuantizedConvolutionOp( rewriter, uniform_op, identifier_to_attr, quantization_method, enable_per_channel_quantization))) return failure(); } else if (auto uniform_op = - llvm::dyn_cast( - inner_op)) { + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { if (failed(FillAttributesForUniformQuantizedConvolutionOp( rewriter, uniform_op, identifier_to_attr, quantization_method, enable_per_channel_quantization))) return failure(); } else if (auto uniform_op = - llvm::dyn_cast(inner_op)) { + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { if (failed(FillAttributesForUniformQuantizedDotOp( rewriter, uniform_op, identifier_to_attr, quantization_method, enable_per_channel_quantization))) return failure(); + } else if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformQuantizedAddOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); + } else if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformQuantizedClipByValueOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); + } else if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformRequantizeOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); } } return success(); @@ -535,15 +558,28 @@ LogicalResult TransferAttributes(func::FuncOp float_func, } // Get the corresponding quantized function name from the given function name. -std::string GetQuantizedFunctionName(StringRef func_name) { +std::string GetQuantizedFunctionName(StringRef func_name, + const bool is_hybrid) { if (func_name.startswith(kQuantizedFuncPrefix)) return func_name.str(); if (!func_name.startswith(kCompositeFuncPrefix)) return ""; - return llvm::Twine(kQuantizedFuncPrefix) - .concat(llvm::Twine( - func_name.substr(kCompositeFuncPrefix.size()).rsplit("_fn").first)) - .concat("_fn") - .str(); + auto base_function_name = + llvm::Twine(kQuantizedFuncPrefix) + .concat(llvm::Twine(func_name.substr(kCompositeFuncPrefix.size()) + .rsplit("_fn") + .first)); + + return is_hybrid + ? 
base_function_name.concat("_float_output").concat("_fn").str() + : base_function_name.concat("_fn").str(); +} + +bool ContainsQuantizedReusltType(ArrayRef result_types) { + for (auto current_type : result_types) { + if (!current_type.dyn_cast().getElementType().isF32()) + return true; + } + return false; } // Unwraps quantization parameters of PartitionedCall ops with quantized @@ -554,20 +590,17 @@ class QuantizeFunctionPattern explicit QuantizeFunctionPattern(MLIRContext* context, const QuantMethod quantization_method, const OpSet target_opset, - const bool enable_per_channel_quantization, - const bool enable_legacy_weight_only) + const bool enable_per_channel_quantization) : OpRewritePattern(context), quantization_method_(quantization_method), target_opset_(target_opset), - enable_per_channel_quantization_(enable_per_channel_quantization), - enable_legacy_weight_only_(enable_legacy_weight_only) {} + enable_per_channel_quantization_(enable_per_channel_quantization) {} private: QuantMethod quantization_method_ = tensorflow::quantization::QuantizationMethod::STATIC_RANGE; OpSet target_opset_ = OpSet::TF; bool enable_per_channel_quantization_; - bool enable_legacy_weight_only_; LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op, PatternRewriter& rewriter) const override { @@ -579,24 +612,20 @@ class QuantizeFunctionPattern if (!f_attr.getValue().startswith(kCompositeFuncPrefix)) { return failure(); } - // Determines if all required float input/outputs are now quantized. - bool has_quantized_types = true; - switch (quantization_method_) { - case tensorflow::quantization::QuantizationMethod::DYNAMIC_RANGE: - has_quantized_types &= IsQuantizedCallforDynamicRange(call_op); - break; - case tensorflow::quantization::QuantizationMethod::STATIC_RANGE: - has_quantized_types &= IsQuantizedCallforStaticRange(call_op); - break; - case tensorflow::quantization::QuantizationMethod::WEIGHT_ONLY: - // Skipping input type check for weight-only quantization as it can be - // dequantized beforehand for the legacy scheme. - has_quantized_types &= !enable_legacy_weight_only_; - break; - default: - call_op->emitError("The quantization method is not supported."); - return failure(); + + bool has_quantized_types = false; + if (quantization_method_ == + tensorflow::quantization::QuantizationMethod::WEIGHT_ONLY) { + // Skipping input type check for weight-only quantization as it can be + // dequantized beforehand for the legacy scheme. + has_quantized_types = true; + } else { + // Determines if all required float input/outputs are now quantized. + // Either one of the criteria needs to meet. + has_quantized_types |= IsQuantizedCallforDynamicRange(call_op); + has_quantized_types |= IsQuantizedCallforStaticRange(call_op); } + if (!has_quantized_types) return failure(); SmallVector args; @@ -703,7 +732,6 @@ class QuantizeFunctionPattern result_types.push_back(result_type); continue; } - if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { ShapedType new_result_type = ConvertIntToQint( result_type.cast(), rewriter.getContext()); @@ -730,8 +758,14 @@ class QuantizeFunctionPattern dyn_cast(symbol_table.lookup(f_attr.getValue())); rewriter.setInsertionPointAfter(float_func); + // Applies only for hybrid ops in SRQ. 
+ const bool is_hybrid = + !ContainsQuantizedReusltType(result_types) && + (quantization_method_ == + tensorflow::quantization::QuantizationMethod::STATIC_RANGE); const std::string quantized_function_name = - GetQuantizedFunctionName(f_attr.getValue()); + GetQuantizedFunctionName(f_attr.getValue(), is_hybrid); + const mlir::func::FuncOp quantized_func = dyn_cast(symbol_table.lookup(quantized_function_name)); mlir::func::FuncOp new_quantized_func = @@ -816,7 +850,7 @@ class QuantizeFunctionPattern // the length of the "_fn" suffix. const size_t fn_suffix_length = 3; std::string quantized_function_name = - GetQuantizedFunctionName(f_attr.getValue()); + GetQuantizedFunctionName(f_attr.getValue(), /*is_hybrid=*/false); quantized_function_name.replace( quantized_function_name.size() - fn_suffix_length, fn_suffix_length, kFloatOutputFuncPrefix); @@ -905,7 +939,8 @@ class QuantizeConstPattern // TODO(b/225793355): It adds TensorProtoAttr to the constant as a // workaround. tensorflow::TensorProto tensor_proto; - if (!mlir::tfg::ConvertToTensorProto(tensor_proto_attr, &tensor_proto) + if (!mlir::tfg::ConvertToTensorProto( + tensor_proto_attr.cast(), &tensor_proto) .ok()) { return failure(); } @@ -1047,7 +1082,8 @@ class QuantizationSummary { // Get the representative name attribute value of a composite function. FailureOr GetRepresentativeName(StringRef func_name) { - std::string quantized_func_name = GetQuantizedFunctionName(func_name); + std::string quantized_func_name = + GetQuantizedFunctionName(func_name, /*is_hybrid=*/false); auto quantized_func = dyn_cast_or_null( symbol_table_.lookup(quantized_func_name)); // Quantized function does not exist for weight-only case. @@ -1125,13 +1161,16 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { signalPassFailure(); } - RewritePatternSet patterns(ctx); - patterns.add( - ctx, quantization_method_, target_opset_, - enable_per_channel_quantization_, enable_legacy_weight_only_); + // Legacy weight-only does not require quantized ops. 
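A quick naming example for the is_hybrid plumbing above, assuming the usual "composite_" / "quantized_" values behind kCompositeFuncPrefix and kQuantizedFuncPrefix (the prefix constants themselves are not shown in this diff):

// GetQuantizedFunctionName("composite_matmul_with_bias_fn_1",
//                          /*is_hybrid=*/false)
//   -> "quantized_matmul_with_bias_fn"
// GetQuantizedFunctionName("composite_matmul_with_bias_fn_1",
//                          /*is_hybrid=*/true)
//   -> "quantized_matmul_with_bias_float_output_fn"
//
// is_hybrid is true only under static-range quantization when none of the
// call's result types is quantized, which routes the call to the
// "*_float_output_fn" entries of the quantized function library.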
+ if (!enable_legacy_weight_only_) { + RewritePatternSet patterns(ctx); + patterns.add(ctx, quantization_method_, + target_opset_, + enable_per_channel_quantization_); - if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) { - signalPassFailure(); + if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) { + signalPassFailure(); + } } // Constant quantization is a lossy transformation, so they are applied only diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library.mlir index 07883e99afd..7ccff9d7091 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library.mlir @@ -393,6 +393,36 @@ module { } } // end for + // TODO(b/278493977): Create generic implementation of lifting any fused op + // with any reshaping op + for main_op in ["MatMul"] { + parameters[ + {"quantized_ops": ["${main_op}", "Reshape", "BiasAdd"], "act_func": "internal_requantize_no_activation_fn", "output_type": "i8"}, + {"quantized_ops": ["${main_op}", "Reshape", "BiasAdd"], "act_func": "internal_dequantize_no_activation_fn", "output_type": "f32"}, + ] + func.func @GenerateQuantizedFunctionName(${quantized_ops}, "${output_type}")(%input : tensor<*xi8>, + %filter : tensor<*xi8>, %bias : tensor<*xi32>, %shape : tensor<*xi32>, + %input_scale : tensor<*xf32>, %input_zp : tensor<*xi32>, + %filter_scale : tensor<*xf32>, %filter_zp : tensor<*xi32>, + %bias_scale : tensor<*xf32>, %bias_zp : tensor<*xi32>, + %out_scale : tensor<*xf32>, %out_zp : tensor<*xi32>) -> tensor<*x${output_type}> + attributes {tf_quant.quantized_ops = ${quantized_ops}} { + %0 = "tf.PartitionedCall"(%input, %filter, %input_scale, %input_zp, + %filter_scale, %filter_zp) { + config = "", config_proto = "", executor_type = "", f=@GenerateImplFunctionName(${main_op}) + } : (tensor<*xi8>, tensor<*xi8>, tensor<*xf32>, tensor<*xi32>, + tensor<*xf32>, tensor<*xi32>) -> tensor<*xi32> + %1 = "tf.Reshape"(%0, %shape) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + %2 = "tf.AddV2"(%1, %bias) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + %3 = "tf.PartitionedCall"(%2, %input_scale, %input_zp, %filter_scale, %filter_zp, + %out_scale, %out_zp) { + config = "", config_proto = "", executor_type = "", f=@${act_func} + } : (tensor<*xi32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>, + tensor<*xf32>, tensor<*xi32>) -> tensor<*x${output_type}> + func.return %3 : tensor<*x${output_type}> + } + } // end for + func.func @quantize_i8(%input : tensor<*xf32>, %scale : tensor<*xf32>, %zp : tensor<*xi32>) -> tensor<*xi8> { %float_zp = "tf.Cast"(%zp) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32> %div = "tf.Div"(%input, %scale) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> @@ -421,20 +451,49 @@ module { // Weight-only functions. //===----------------------------------------------------------------------===// + func.func private @internal_dequantize_i8_in_f32_fn( + %input : tensor<*xi8>, %weight_scale : tensor<*xf32>) -> tensor<*xf32> { + %input_f32 = "tf.Cast"(%input) : (tensor<*xi8>) -> tensor<*xf32> + %mul = "tf.Mul"(%input_f32, %weight_scale) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + func.return %mul : tensor<*xf32> + } + // Note that input i64 type is also supported by this. 
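The new internal_dequantize_i8_in_f32_fn helper above is simply the symmetric weight-only dequantization (cast, then scale). A scalar C++ sketch of the same computation, for reference only:

#include <cstdint>

// Symmetric weight dequantization: the zero point is implicitly 0, so the
// original float is recovered (approximately) as weight_scale * quantized.
float DequantizeWeight(std::int8_t quantized, float weight_scale) {
  return static_cast<float>(quantized) * weight_scale;
}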
+ // As the output is quantized type, output scale/zp is required for the arguments. parameters[ - {"quantized_ops": ["Gather"], "output_type": "i8"} + {"quantized_ops": ["Gather"], "act_func": "internal_identity_fn", "output_type": "i8"} ] - func.func @GenerateQuantizedFunctionName(${quantized_ops})( + func.func @GenerateQuantizedFunctionName(${quantized_ops}, "${output_type}")( %weight : tensor<*xi8>, %input : tensor<*xi32>, %axis : tensor, %weight_scale : tensor<*xf32>, %weight_zp : tensor<*xi32>, %out_scale : tensor<*xf32>, %out_zp : tensor<*xi32>) -> tensor<*x${output_type}> - attributes {tf_quant.quantized_ops = ${quantized_ops}} - { + attributes {tf_quant.quantized_ops = ${quantized_ops}} { + + %out = "tf.GatherV2"(%weight, %input, %axis) { + batch_dims = 0 : i64, attr_map = "batch_dims:0"} : (tensor<*xi8>, tensor<*xi32>, tensor) -> tensor<*xi8> + + func.return %out : tensor<*x${output_type}> + } + + // Note that input i64 type is also supported by this. + // The dequantization is merged to the quantized function. + // As the output type is specified to f32, the quantized function has "_float_output_fn" tag at the end. + parameters[ + {"quantized_ops": ["Gather"], "act_func": "internal_dequantize_i8_in_f32_fn", "output_type": "f32"} + ] + func.func @GenerateQuantizedFunctionName(${quantized_ops}, "${output_type}")( + %weight : tensor<*xi8>, %input : tensor<*xi32>, %axis : tensor, + %weight_scale : tensor<*xf32>, %weight_zp : tensor<*xi32>) -> tensor<*x${output_type}> + attributes {tf_quant.quantized_ops = ${quantized_ops}} { + %accum_out = "tf.GatherV2"(%weight, %input, %axis) { batch_dims = 0 : i64, attr_map = "batch_dims:0"} : (tensor<*xi8>, tensor<*xi32>, tensor) -> tensor<*xi8> - func.return %accum_out : tensor<*x${output_type}> + %out = "tf.PartitionedCall"(%accum_out, %weight_scale) { + config = "", config_proto = "", executor_type = "", f=@${act_func} + } : (tensor<*xi8>, tensor<*xf32>) -> tensor<*x${output_type}> + + func.return %out : tensor<*x${output_type}> } } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library_uniform_quantized.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library_uniform_quantized.mlir index 2225e588e39..0d95b8eda87 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library_uniform_quantized.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library_uniform_quantized.mlir @@ -51,7 +51,13 @@ module { %filter_scale, %filter_zp, %out_scale, %out_zp) { config = "", config_proto = "", executor_type = "", f=@GenerateImplFunctionName(${main_op}) } : (tensor<*x!tf_type.qint8>, tensor<*x!tf_type.qint8>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint32> - %add = "tf.UniformQuantizedAdd"(%main_out, %bias, %input_scale, %input_zp, %bias_scale, %bias_zp, %out_scale, %out_zp) { + // Extract channel shape from filter, and ensure input/output scale/zp's have the same channel size. 
+ %filter_shape = "tf.Shape" (%filter_scale) : (tensor<*xf32>) -> tensor<*xi32> + %input_scale_filled = "tf.Fill" (%filter_shape, %input_scale) : (tensor<*xi32>, tensor<*xf32>) -> tensor<*xf32> + %input_zp_filled = "tf.Fill" (%filter_shape, %input_zp) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + %out_scale_filled = "tf.Fill" (%filter_shape, %out_scale) : (tensor<*xi32>, tensor<*xf32>) -> tensor<*xf32> + %out_zp_filled = "tf.Fill" (%filter_shape, %out_zp) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + %add = "tf.UniformQuantizedAdd"(%main_out, %bias, %input_scale_filled, %input_zp_filled, %bias_scale, %bias_zp, %out_scale_filled, %out_zp_filled) { lhs_quantization_axis = -1, lhs_quantization_min_val = -128, lhs_quantization_max_val = 127, @@ -64,9 +70,9 @@ module { T = "tfdtype$DT_QINT32", attr_map = "" } : (tensor<*x!tf_type.qint32>, tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint32> - %act = "tf.PartitionedCall"(%add, %input_scale, %input_zp, %out_scale, %out_zp) { + %act = "tf.PartitionedCall"(%add, %input_scale_filled, %input_zp_filled, %out_scale_filled, %out_zp_filled, %out_scale, %out_zp) { config = "", config_proto = "", executor_type = "", f=@${act_func} - } : (tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x${output_type}> + } : (tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x${output_type}> func.return %act : tensor<*x${output_type}> } @@ -85,9 +91,14 @@ module { %filter_scale, %filter_zp, %out_scale, %out_zp) { config = "", config_proto = "", executor_type = "", f=@GenerateImplFunctionName(${main_op}) } : (tensor<*x!tf_type.qint8>, tensor<*x!tf_type.qint8>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint32> - %act = "tf.PartitionedCall"(%main_out, %input_scale, %input_zp, %out_scale, %out_zp) { + %filter_shape = "tf.Shape" (%filter_scale) : (tensor<*xf32>) -> tensor<*xi32> + %input_scale_filled = "tf.Fill" (%filter_shape, %input_scale) : (tensor<*xi32>, tensor<*xf32>) -> tensor<*xf32> + %input_zp_filled = "tf.Fill" (%filter_shape, %input_zp) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + %out_scale_filled = "tf.Fill" (%filter_shape, %out_scale) : (tensor<*xi32>, tensor<*xf32>) -> tensor<*xf32> + %out_zp_filled = "tf.Fill" (%filter_shape, %out_zp) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + %act = "tf.PartitionedCall"(%main_out, %input_scale_filled, %input_zp_filled, %out_scale_filled, %out_zp_filled, %out_scale, %out_zp) { config = "", config_proto = "", executor_type = "", f=@${act_func} - } : (tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x${output_type}> + } : (tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x${output_type}> func.return %act : tensor<*x${output_type}> } } // end for @@ -198,7 +209,7 @@ module { // Requantizes and applies quantized Relu by clipping. 
func.func private @internal_requantize_no_activation_fn(%input : tensor<*x!tf_type.qint32>, %input_scale : tensor<*xf32>, %input_zp : tensor<*xi32>, - %out_scale : tensor<*xf32>, %out_zp : tensor<*xi32>) -> tensor<*x!tf_type.qint8> { + %out_scale : tensor<*xf32>, %out_zp : tensor<*xi32>, %out_scale_single : tensor<*xf32>, %out_zp_single : tensor<*xi32>) -> tensor<*x!tf_type.qint8> { %q_out = "tf.PartitionedCall"(%input, %input_scale, %input_zp, %out_scale, %out_zp) { config = "", config_proto = "", executor_type = "", f=@internal_requantize_qi8_fn } : (tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint8> @@ -207,20 +218,23 @@ module { // Requantizes and applies quantized Relu6 by clipping. func.func private @internal_requantize_and_relu_fn(%input : tensor<*x!tf_type.qint32>, %input_scale : tensor<*xf32>, %input_zp : tensor<*xi32>, - %out_scale : tensor<*xf32>, %out_zp : tensor<*xi32>) -> tensor<*x!tf_type.qint8> { + %out_scale : tensor<*xf32>, %out_zp : tensor<*xi32>, %out_scale_single : tensor<*xf32>, %out_zp_single : tensor<*xi32>) -> tensor<*x!tf_type.qint8> { + %filter_shape = "tf.Shape" (%input_scale) : (tensor<*xf32>) -> tensor<*xi32> %i8_min = "tf.Const"() {value = dense<-128.0> : tensor} : () -> tensor %i8_max = "tf.Const"() {value = dense<127.0> : tensor} : () -> tensor + %i8_min_filled = "tf.Fill" (%filter_shape, %i8_min) : (tensor<*xi32>, tensor) -> tensor<*xf32> + %i8_max_filled = "tf.Fill" (%filter_shape, %i8_max) : (tensor<*xi32>, tensor) -> tensor<*xf32> %float_out_zp = "tf.Cast"(%out_zp) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32> - %clip_min = "tf.Maximum"(%i8_min, %float_out_zp) : (tensor, tensor<*xf32>) -> tensor - %qclip_min = "tf.Cast"(%i8_min) {Truncate = false} : (tensor) -> tensor - %qi8_max = "tf.Cast"(%i8_max) {Truncate = false} : (tensor) -> tensor - %relu = "tf.UniformQuantizedClipByValue"(%input, %qclip_min, %qi8_max, %out_scale, %out_zp) { + %clip_min = "tf.Maximum"(%i8_min_filled, %float_out_zp) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %qclip_min = "tf.Cast"(%clip_min) {Truncate = false} : (tensor<*xf32>) -> tensor<*x!tf_type.qint32> + %qclip_max = "tf.Cast"(%i8_max_filled) {Truncate = false} : (tensor<*xf32>) -> tensor<*x!tf_type.qint32> + %relu = "tf.UniformQuantizedClipByValue"(%input, %qclip_min, %qclip_max, %out_scale, %out_zp) { T = "tfdtype$DT_QINT32", quantization_axis = -1, quantization_min_val = -128, quantization_max_val = 127, attr_map = "" - } : (tensor<*x!tf_type.qint32>, tensor, tensor, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint32> + } : (tensor<*x!tf_type.qint32>, tensor<*x!tf_type.qint32>, tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint32> %requantize = "tf.PartitionedCall"(%relu, %input_scale, %input_zp, %out_scale, %out_zp) { config = "", config_proto = "", executor_type = "", f=@internal_requantize_qi8_fn } : (tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint8> @@ -229,30 +243,34 @@ module { // Apply requantization and relu6. 
func.func private @internal_requantize_and_relu6_fn(%input : tensor<*x!tf_type.qint32>, %input_scale : tensor<*xf32>, %input_zp : tensor<*xi32>, - %out_scale : tensor<*xf32>, %out_zp : tensor<*xi32>) -> tensor<*x!tf_type.qint8> { + %out_scale : tensor<*xf32>, %out_zp : tensor<*xi32>, %out_scale_single : tensor<*xf32>, %out_zp_single : tensor<*xi32>) -> tensor<*x!tf_type.qint8> { + %filter_shape = "tf.Shape" (%input_scale) : (tensor<*xf32>) -> tensor<*xi32> %i8_min = "tf.Const"() {value = dense<-128.0> : tensor} : () -> tensor %i8_max = "tf.Const"() {value = dense<127.0> : tensor} : () -> tensor %act_max = "tf.Const"() {value = dense<6.0> : tensor} : () -> tensor - %i8_act_max_0 = "tf.PartitionedCall"(%act_max, %input_scale, %input_zp) { + // Singular scale/zp is needed to ensure quantization is per-tensor for this variable. + %i8_act_max_0 = "tf.PartitionedCall"(%act_max, %out_scale_single, %out_zp_single) { config = "", config_proto = "", executor_type = "", f=@quantize_i8 } : (tensor, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint8> %i8_act_max_1 = "tf.Cast"(%i8_act_max_0) {Truncate = false} : (tensor<*x!tf_type.qint8>) -> tensor %float_out_zp = "tf.Cast"(%out_zp) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32> - %clip_min = "tf.Maximum"(%i8_min, %float_out_zp) : (tensor, tensor<*xf32>) -> tensor - %clip_max = "tf.Minimum"(%i8_max, %i8_act_max_1) : (tensor, tensor) -> tensor - %qclip_min = "tf.Cast"(%i8_min) {Truncate = false} : (tensor) -> tensor - %qclip_max = "tf.Cast"(%i8_max) {Truncate = false} : (tensor) -> tensor + %i8_min_filled = "tf.Fill" (%filter_shape, %i8_min) : (tensor<*xi32>, tensor) -> tensor<*xf32> + %i8_max_filled = "tf.Fill" (%filter_shape, %i8_max) : (tensor<*xi32>, tensor) -> tensor<*xf32> + %i8_act_max_1_filled = "tf.Fill" (%filter_shape, %i8_act_max_1) : (tensor<*xi32>, tensor) -> tensor<*xf32> + %clip_min = "tf.Maximum"(%i8_min_filled, %float_out_zp) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %clip_max = "tf.Minimum"(%i8_max_filled, %i8_act_max_1_filled) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %qclip_min = "tf.Cast"(%clip_min) {Truncate = false} : (tensor<*xf32>) -> tensor<*x!tf_type.qint32> + %qclip_max = "tf.Cast"(%clip_max) {Truncate = false} : (tensor<*xf32>) -> tensor<*x!tf_type.qint32> %relu = "tf.UniformQuantizedClipByValue"(%input, %qclip_min, %qclip_max, %out_scale, %out_zp) { T = "tfdtype$DT_QINT32", quantization_axis = -1, quantization_min_val = -128, quantization_max_val = 127, attr_map = "" - } : (tensor<*x!tf_type.qint32>, tensor, tensor, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint32> + } : (tensor<*x!tf_type.qint32>, tensor<*x!tf_type.qint32>, tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint32> %requantize = "tf.PartitionedCall"(%relu, %input_scale, %input_zp, %out_scale, %out_zp) { config = "", config_proto = "", executor_type = "", f=@internal_requantize_qi8_fn } : (tensor<*x!tf_type.qint32>, tensor<*xf32>, tensor<*xi32>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf_type.qint8> func.return %requantize : tensor<*x!tf_type.qint8> } } - diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.cc index c65f7ac7906..1491ccc049f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.cc @@ -25,7 +25,7 @@ namespace mlir { namespace quant { bool HasQuantizedTensors(Operation* op) { - if 
(IsOpNotQuantizable(op)) return false; + if (!IsOpQuantizable(op)) return false; for (Type operand_type : op->getOperandTypes()) { auto tensor_type = operand_type.dyn_cast(); if (tensor_type && tensor_type.getElementType().isa()) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD index 9372a6ca393..3b5a9d55f5f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD @@ -85,10 +85,12 @@ cc_library( hdrs = ["quantize_model.h"], compatible_with = get_compatible_with_cloud(), deps = if_static([":quantize_model_cc_impl"]) + [ + "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", - "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", - "//tensorflow/core:protos_all_cc", ], ) @@ -124,7 +126,7 @@ tf_py_test( deps = [ ":pywrap_quantize_model", "//tensorflow:tensorflow_py", - "//tensorflow/python/platform", + "//tensorflow/python/platform:client_testlib", ], ) @@ -147,7 +149,6 @@ pytype_strict_library( "//tensorflow/python/saved_model:tag_constants", "//tensorflow/python/training:saver", "//tensorflow/python/training:training_lib", - "//tensorflow/python/types", "@absl_py//absl/logging", ], ) @@ -171,14 +172,15 @@ pytype_strict_library( "//tensorflow/python/client:session", "//tensorflow/python/eager:context", "//tensorflow/python/eager:wrap_function", + "//tensorflow/python/framework:tensor_conversion", "//tensorflow/python/lib/io:lib", - "//tensorflow/python/platform", + "//tensorflow/python/platform:tf_logging", "//tensorflow/python/saved_model:load", "//tensorflow/python/saved_model:loader", "//tensorflow/python/saved_model:signature_constants", "//tensorflow/python/saved_model:tag_constants", "//tensorflow/python/trackable:autotrackable", - "//tensorflow/python/types", + "//tensorflow/python/types:core", "//third_party/py/numpy", "@absl_py//absl/logging", ], @@ -198,6 +200,7 @@ tf_py_test( "//tensorflow/core:protos_all_py", "//tensorflow/python:client_testlib", "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/platform:tf_logging", "//tensorflow/python/saved_model:tag_constants", "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", @@ -236,7 +239,7 @@ pytype_library( "//tensorflow/python/saved_model:signature_def_utils", "//tensorflow/python/trackable:asset", "//tensorflow/python/trackable:autotrackable", - "//tensorflow/python/types", + "//tensorflow/python/types:core", "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], @@ -264,8 +267,8 @@ pytype_strict_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/python/client:session", - "//tensorflow/python/platform", - "//tensorflow/python/types", + "//tensorflow/python/platform:tf_logging", + "//tensorflow/python/types:core", ], ) @@ -279,7 +282,7 @@ tf_py_test( "//tensorflow/python/framework:ops", "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", - "//tensorflow/python/types", + "//tensorflow/python/types:core", "//third_party/py/numpy", ], ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py 
b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py index 6f8df273bd3..892dfde7c9a 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py @@ -16,7 +16,7 @@ # TODO(b/264234648): Refactor and cleanup this file. import itertools import os -from typing import List, Mapping, Optional, Sequence, Tuple, Union +from typing import Mapping, Optional, Sequence, Tuple, Union from absl.testing import parameterized import numpy as np @@ -135,6 +135,14 @@ class MultipleSignatureModel(module.Module): Used to test where the quantizer has to handle multiple signatures. """ + def __init__(self): + self.matmul_filters = random_ops.random_uniform( + shape=(4, 3), minval=-1.0, maxval=1.0 + ) + self.conv_filters = np.random.uniform( + low=-10, high=10, size=(2, 3, 3, 2) + ).astype('f4') + @def_function.function( input_signature=[ tensor_spec.TensorSpec(shape=[1, 4], dtype=dtypes.float32) @@ -149,8 +157,7 @@ class MultipleSignatureModel(module.Module): Returns: A map of: output key -> output result. """ - filters = random_ops.random_uniform(shape=(4, 3), minval=-1.0, maxval=1.0) - out = math_ops.matmul(matmul_input, filters) + out = math_ops.matmul(matmul_input, self.matmul_filters) return {'output': out} @@ -168,12 +175,9 @@ class MultipleSignatureModel(module.Module): Returns: A map of: output key -> output result. """ - filters = np.random.uniform(low=-10, high=10, size=(2, 3, 3, 2)).astype( - 'f4' - ) out = nn_ops.conv2d( conv_input, - filters, + self.conv_filters, strides=[1, 1, 2, 1], dilations=[1, 1, 1, 1], padding='SAME', @@ -183,6 +187,8 @@ class MultipleSignatureModel(module.Module): return {'output': out} +# TODO(b/280208261): Add unit tests for comparing unquantized and +# quantized results @test_util.run_all_in_graph_and_eager_modes class QuantizationOptionsTest(quantize_model_test_base.QuantizedModelTest): """Test cases regarding the use of QuantizationOptions proto. @@ -192,6 +198,10 @@ class QuantizationOptionsTest(quantize_model_test_base.QuantizedModelTest): """ class SimpleModel(module.Module): + def __init__(self): + self.filters = np.random.uniform(low=-1.0, high=1.0, size=(4, 3)).astype( + 'f4' + ) @def_function.function( input_signature=[ @@ -207,9 +217,8 @@ class QuantizationOptionsTest(quantize_model_test_base.QuantizedModelTest): Returns: A map of: output key -> output result. 
""" - filters = np.random.uniform(low=-1.0, high=1.0, size=(4, 3)).astype('f4') - out = math_ops.matmul(input_tensor, filters) + out = math_ops.matmul(input_tensor, self.filters) return {'output': out} def _simple_model_data_gen(self) -> repr_dataset.RepresentativeDataset: @@ -352,6 +361,56 @@ class QuantizationOptionsTest(quantize_model_test_base.QuantizedModelTest): threshold=0.3, ) + @test_util.run_in_graph_and_eager_modes + def test_force_graph_mode_calibration(self): + input_type = dtypes.int32 + input_placeholder = self._create_and_save_tf1_gather_model( + self._input_saved_model_path, + signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, + tags={tag_constants.SERVING}, + input_key='x', + output_key='output', + input_type=input_type, + ) + + data_gen = self._create_data_generator( + input_key='x', + shape=input_placeholder.shape, + minval=0, + maxval=10, + dtype=input_type, + ) + + options = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + experimental_method=_ExperimentalMethod.STATIC_RANGE + ), + force_graph_mode_calibration=True, + ) + + with self.assertLogs(level='INFO') as info_logs: + # Save the logger verbosity. + prev_log_level = logging.get_verbosity() + logging.set_verbosity(logging.INFO) + + try: + quantize_model.quantize( + self._input_saved_model_path, + quantization_options=options, + representative_dataset=data_gen, + ) + finally: + # Restore the logger verbosity. + logging.set_verbosity(prev_log_level) + + self.assertNotEmpty(info_logs.records) + self.assertTrue( + self._any_log_contains( + 'Calibration step is executed in graph mode.', + info_logs.records, + ) + ) + class TensorNamePreservationTest(quantize_model_test_base.QuantizedModelTest): @@ -495,24 +554,6 @@ class TensorNamePreservationTest(quantize_model_test_base.QuantizedModelTest): class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): - def _any_warning_contains( - self, substring: str, warnings_list: List['LogRecord'] - ) -> bool: - """Returns True if any of the warnings contains a given substring. - - Args: - substring: A piece of string to check whether it exists in the warning - message. - warnings_list: A list of `absl.logging.LogRecord`s. - - Returns: - True if and only if the substring exists in any of the warnings in - `warnings_list`. 
- """ - return any( - map(lambda warning: substring in str(warning.message), warnings_list) - ) - @parameterized.parameters( parameter_combinations([{ 'shapes': [ @@ -775,7 +816,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): ) return {'output': q_out} - np.random.seed(1234) model = ConvModel() saved_model_save.save(model, self._input_saved_model_path) @@ -1112,7 +1152,9 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): ): input_type = dtypes.int32 model = self._create_simple_gather_and_conv_model( - input_type, filter_shape=(2, 3, 3, 1024), is_qat_model=True + input_type, + filter_shape=(2, 3, 3, 1024), + is_qat_model=True, ) saved_model_save.save(model, self._input_saved_model_path) @@ -1126,25 +1168,16 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): op_set=quant_opts_pb2.XLA, ) - data_gen = self._create_data_generator( - input_key='input_tensor', - shape=(6), - minval=0, - maxval=10, - dtype=input_type, - ) - converted_model = quantize_model.quantize( self._input_saved_model_path, ['serving_default'], tags, self._output_saved_model_path, quantization_options, - representative_dataset=data_gen, ) self.assertIsNotNone(converted_model) self.assertSizeRatioLessThan( - self._output_saved_model_path, self._input_saved_model_path, 1 / 3 + self._output_saved_model_path, self._input_saved_model_path, 0.5 ) # TODO(b/244276332): Allow table initialization in TF2 eager mode. @@ -1220,6 +1253,77 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertAllClose(lookup_val, [1.0, 2.0, 0.0]) + # TODO(b/244276332): Allow table initialization in TF2 eager mode. + @test_util.deprecated_graph_mode_only + def test_qat_file_init_hash_table_lookup_model_tf1(self): + tags = {tag_constants.SERVING} + signature_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + + # Create and save a simple model that involves a hash table. + inputs, outputs = self._create_and_save_file_init_hash_table_qat_model_tf1( + self._input_saved_model_path, tags, signature_def_key + ) + + # Make sure that the desired input key and output key is present. + self.assertIn('input_vocabs', inputs.keys()) + self.assertIn('lookup', outputs.keys()) + + # Representative dataset is composed of a set of vocabs for table lookup. + repr_ds = [ + {'input_vocabs': np.array([b'static', b'range', b'quantization'])} + for _ in range(4) + ] + + quantization_options = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + experimental_method=_ExperimentalMethod.STATIC_RANGE + ) + ) + signature_def_keys = [signature_def_key] + + quantize_model.quantize( + self._input_saved_model_path, + signature_def_keys, + tags, + self._output_saved_model_path, + quantization_options, + representative_dataset=repr_ds, + ) + + # Tests table lookup to make sure the table has been initialized + # successfully. + with session.Session(graph=ops.Graph()) as sess: + output_meta_graph_def = saved_model_loader.load( + sess, tags=tags, export_dir=self._output_saved_model_path + ) + + # The graph should contain a quantized function call (it contains a + # single f32 matmul node). 
+ self.assertTrue( + self._contains_quantized_function_call( + output_meta_graph_def.graph_def + ) + ) + self.assertCountEqual( + output_meta_graph_def.signature_def.keys(), signature_def_keys + ) + + signature_def = output_meta_graph_def.signature_def[signature_def_key] + input_tensor_name = signature_def.inputs['input_vocabs'].name + input_tensor = sess.graph.get_tensor_by_name(input_tensor_name) + lookup_tensor_name = signature_def.outputs['lookup'].name + lookup_tensor = sess.graph.get_tensor_by_name(lookup_tensor_name) + + lookup_val = sess.run( + lookup_tensor, + feed_dict={ + input_tensor: np.array([b'dynamic', b'quantization', b'range']) + }, + ) + + # "dynamic" is not in the table: -1 (default value) + self.assertAllClose(lookup_val, [-1.0, 2.0, 1.0]) + # Run this test only with the eager mode. @test_util.run_v2_only def test_ptq_model_with_variable(self): @@ -1309,140 +1413,143 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def self.assertTrue(self._contains_quantized_function_call(output_graphdef)) - # TODO(b/263830952): Use dictionaries instead of tuples for parameters. + # Check only the most simple case and the most complicated cases. @parameterized.named_parameters( - ('none', None, False, False, quant_opts_pb2.TF, False, False), - ('relu', nn_ops.relu, False, False, quant_opts_pb2.TF, False, False), - ('relu6', nn_ops.relu6, False, False, quant_opts_pb2.TF, False, False), - ('bn', None, False, True, quant_opts_pb2.TF, False, False), - ( - 'bn_and_relu', - nn_ops.relu, - False, - True, - quant_opts_pb2.TF, - False, - False, - ), - ('with_bias', None, True, False, quant_opts_pb2.TF, False, False), - ('with_bias_and_bn', None, True, True, quant_opts_pb2.TF, False, False), - ( - 'with_bias_and_bn_and_relu', - nn_ops.relu, - True, - True, - quant_opts_pb2.TF, - False, - False, - ), - ( - 'with_bias_and_relu', - nn_ops.relu, - True, - False, - quant_opts_pb2.TF, - False, - False, - ), - ( - 'with_bias_and_relu6', - nn_ops.relu6, - True, - False, - quant_opts_pb2.TF, - False, - False, - ), - ( - 'with_bias_and_bn_to_xla', - None, - True, - True, - quant_opts_pb2.XLA, - False, - False, - ), - ( - 'with_bias_and_relu6_to_xla', - nn_ops.relu6, - True, - False, - quant_opts_pb2.XLA, - False, - False, - ), - ( - 'with_bias_and_bn_to_xla_dynamic', - None, - True, - True, - quant_opts_pb2.XLA, - True, - False, - ), - ( - 'with_bias_and_relu6_to_xla_dynamic', - nn_ops.relu6, - True, - False, - quant_opts_pb2.XLA, - True, - False, - ), - ( - 'none_to_uq', - None, - False, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), - ( - 'none_to_uq_per_channel', - None, - False, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - True, - ), - ( - 'relu_to_uq', - nn_ops.relu, - False, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), - ( - 'with_bias_to_uq', - None, - True, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), - ( - 'with_bias_and_relu_to_uq', - nn_ops.relu, - True, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), - ( - 'with_bias_and_relu6_to_uq', - nn_ops.relu6, - True, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), + { + 'testcase_name': 'none', + 'activation_fn': None, + 'has_bias': False, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'relu', + 'activation_fn': 
nn_ops.relu, + 'has_bias': False, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'relu6', + 'activation_fn': nn_ops.relu6, + 'has_bias': False, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'bn', + 'activation_fn': None, + 'has_bias': False, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias', + 'activation_fn': None, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6_to_xla', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6_to_xla_dynamic', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla_dynamic', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6_to_uq', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_uq', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6_to_uq_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_uq_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + }, ) @test_util.run_in_graph_and_eager_modes def test_conv_ptq_model( @@ -1457,7 
+1564,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): input_shape = [None, None, None, 3] if input_shape_dynamic else [1, 3, 4, 3] filter_shape = [2, 3, 3, 2] - np.random.seed(1234) model = self._create_conv2d_model( input_shape, filter_shape, has_bias, has_batch_norm, activation_fn ) @@ -1614,7 +1720,7 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): if target_opset == quant_opts_pb2.UNIFORM_QUANTIZED: self.assertSizeRatioGreaterThan( - self._output_saved_model_path, self._input_saved_model_path, 0.7 + self._output_saved_model_path, self._input_saved_model_path, 0.68 ) self.assertTrue( self._contains_op(output_graphdef, 'UniformQuantizedConvolution') @@ -1628,140 +1734,143 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): else: self.assertTrue(self._contains_quantized_function_call(output_graphdef)) - # TODO(b/263830952): Use dictionaries instead of tuples for parameters. + # Check only the most simple case and the most complicated cases. @parameterized.named_parameters( - ('none', None, False, False, quant_opts_pb2.TF, False, False), - ('relu', nn_ops.relu, False, False, quant_opts_pb2.TF, False, False), - ('relu6', nn_ops.relu6, False, False, quant_opts_pb2.TF, False, False), - ('bn', None, False, True, quant_opts_pb2.TF, False, False), - ( - 'bn_and_relu', - nn_ops.relu, - False, - True, - quant_opts_pb2.TF, - False, - False, - ), - ('with_bias', None, True, False, quant_opts_pb2.TF, False, False), - ('with_bias_and_bn', None, True, True, quant_opts_pb2.TF, False, False), - ( - 'with_bias_and_bn_and_relu', - nn_ops.relu, - True, - True, - quant_opts_pb2.TF, - False, - False, - ), - ( - 'with_bias_and_relu', - nn_ops.relu, - True, - False, - quant_opts_pb2.TF, - False, - False, - ), - ( - 'with_bias_and_relu6', - nn_ops.relu6, - True, - False, - quant_opts_pb2.TF, - False, - False, - ), - ( - 'with_bias_and_bn_to_xla', - None, - True, - True, - quant_opts_pb2.XLA, - False, - False, - ), - ( - 'with_bias_and_relu6_to_xla', - nn_ops.relu6, - True, - False, - quant_opts_pb2.XLA, - False, - False, - ), - ( - 'with_bias_and_bn_to_xla_dynamic', - None, - True, - True, - quant_opts_pb2.XLA, - True, - False, - ), - ( - 'with_bias_and_relu6_to_xla_dynamic', - nn_ops.relu6, - True, - False, - quant_opts_pb2.XLA, - True, - False, - ), - ( - 'none_to_uq', - None, - False, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), - ( - 'none_to_uq_per_channel', - None, - False, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - True, - ), - ( - 'relu_to_uq', - nn_ops.relu, - False, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), - ( - 'with_bias_to_uq', - None, - True, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), - ( - 'with_bias_and_relu_to_uq', - nn_ops.relu, - True, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), - ( - 'with_bias_and_relu6_to_uq', - nn_ops.relu6, - True, - False, - quant_opts_pb2.UNIFORM_QUANTIZED, - False, - False, - ), + { + 'testcase_name': 'none', + 'activation_fn': None, + 'has_bias': False, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'relu', + 'activation_fn': nn_ops.relu, + 'has_bias': False, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'relu6', + 
'activation_fn': nn_ops.relu6, + 'has_bias': False, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'bn', + 'activation_fn': None, + 'has_bias': False, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias', + 'activation_fn': None, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.TF, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6_to_xla', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6_to_xla_dynamic', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_xla_dynamic', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.XLA, + 'input_shape_dynamic': True, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6_to_uq', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_uq', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + { + 'testcase_name': 'with_bias_and_relu6_to_uq_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + }, + { + 'testcase_name': 'with_bias_and_bn_and_relu6_to_uq_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': True, + 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + }, ) @test_util.run_in_graph_and_eager_modes def test_depthwise_conv_ptq_model( @@ -1778,7 +1887,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): model = self._create_depthwise_conv2d_model( input_shape, filter_shape, has_bias, has_batch_norm, activation_fn 
) - np.random.seed(1234) saved_model_save.save(model, self._input_saved_model_path) def data_gen() -> repr_dataset.RepresentativeDataset: @@ -1910,7 +2018,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): batch_sizes: Sequence[int], target_opset: quant_opts_pb2.OpSet, ): - np.random.seed(1234) lhs_batch_size, rhs_batch_size = batch_sizes input_shape = (*lhs_batch_size, 1, 1024) filter_shape = (*rhs_batch_size, 1024, 3) @@ -1922,15 +2029,14 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): has_bias, activation_fn, ) + rng = np.random.default_rng(seed=1234) def data_gen() -> repr_dataset.RepresentativeDataset: for _ in range(500): yield { - 'input_tensor': ops.convert_to_tensor( - np.random.uniform( - low=0.0, high=1.0, size=static_input_shape - ).astype('f4') - ), + 'input_tensor': rng.uniform( + low=0.0, high=1.0, size=static_input_shape + ).astype(np.float32) } tags = {tag_constants.SERVING} @@ -1961,15 +2067,16 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertTrue(self._contains_quantized_function_call(output_graphdef)) input_data = ops.convert_to_tensor( - np.random.uniform(low=0.0, high=1.0, size=static_input_shape).astype( - 'f4' + rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype( + np.float32 ) ) expected_outputs = model.matmul(input_data) got_outputs = converted_model.signatures['serving_default']( input_tensor=ops.convert_to_tensor(input_data) ) - self.assertAllClose(expected_outputs, got_outputs, atol=0.1674) + # The atol value is arbitrary. + self.assertAllClose(expected_outputs, got_outputs, atol=0.22) # Check the converted model in the target opset. quantization_options = quant_opts_pb2.QuantizationOptions( @@ -2003,8 +2110,82 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): input_tensor=ops.convert_to_tensor(input_data) ) # The difference between TF and target path is expected to be small. - self.assertAllClose(new_outputs, got_outputs, atol=0.1202) - self.assertAllClose(new_outputs, expected_outputs, atol=0.1023) + # The atol value is arbitrary. 
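The matmul-with-reshape-and-bias test added just below uses a bias whose size (4) differs from the filter width (8), so the model reshapes the matmul result before adding the bias. A small NumPy walk-through of the shape arithmetic for the 'with_biasadd' / 'with_addv2' parameterization (the array values are placeholders):

import numpy as np

input_tensor = np.random.uniform(0.0, 1.0, size=(32, 16)).astype(np.float32)
filters = np.random.uniform(-1.0, 1.0, size=(16, 8)).astype(np.float32)
bias = np.random.uniform(-1.0, 1.0, size=(4,)).astype(np.float32)

out = input_tensor @ filters   # (32, 8)
# 32 * 8 elements regroup cleanly into rows of bias_size = 4, which matches
# the divisibility condition the test base asserts for bias_size.
out = out.reshape(-1, 4)       # (64, 4)
out = out + bias               # stands in for BiasAdd or AddV2
print(out.shape)               # (64, 4)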
+ self.assertAllClose(new_outputs, got_outputs, atol=0.13) + self.assertAllClose(new_outputs, expected_outputs, atol=0.13) + + @parameterized.named_parameters( + { + 'testcase_name': 'with_biasadd', + 'input_shape': (32, 16), + 'filter_shape': (16, 8), + 'bias_size': 4, + 'use_biasadd': True, + 'activation_fn': nn_ops.relu, + }, + { + 'testcase_name': 'with_addv2', + 'input_shape': (32, 16), + 'filter_shape': (16, 8), + 'bias_size': 4, + 'use_biasadd': False, + 'activation_fn': nn_ops.relu, + }, + ) + def test_matmul_with_reshape_and_bias_ptq_model( + self, input_shape, filter_shape, bias_size, activation_fn, use_biasadd + ): + + model = self._create_matmul_model( + input_shape, + filter_shape, + self._input_saved_model_path, + True, + activation_fn, + bias_size, + use_biasadd, + ) + + rng = np.random.default_rng(seed=1234) + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(5): + yield { + 'input_tensor': rng.uniform( + low=0.0, high=1.0, size=input_shape + ).astype(np.float32) + } + + tags = {tag_constants.SERVING} + + quantization_options = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + experimental_method=_ExperimentalMethod.STATIC_RANGE + ), + op_set=quant_opts_pb2.OpSet.XLA, + ) + + converted_model = quantize_model.quantize( + self._input_saved_model_path, + ['serving_default'], + tags, + self._output_saved_model_path, + quantization_options, + representative_dataset=data_gen(), + ) + + input_data = ops.convert_to_tensor( + rng.uniform(low=0.0, high=1.0, size=input_shape).astype( + np.float32 + ) + ) + expected_outputs = model.matmul(input_data) + + got_outputs = converted_model.signatures['serving_default']( + input_tensor=ops.convert_to_tensor(input_data) + ) + + self.assertAllClose(expected_outputs, got_outputs, atol=0.05) @parameterized.parameters( ('abc,cde->abde', (2, 2, 64), (64, 3, 3), (3, 3), quant_opts_pb2.XLA), @@ -2177,6 +2358,68 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): func.node_def, op_name='XlaConvV2', attr_name='', attr_val=None ) + @test_util.run_in_graph_and_eager_modes + def test_function_alias_preserved_in_qat(self): + _, y_shape, _, x_signature, y_signature = ( + self._prepare_sample_einsum_datashapes('ab,bc->ac') + ) + model = self._create_einsum_model_with_fake_quant( + 'ab,bc->ac', y_shape, x_signature, y_signature + ) + + signatures = { + 'serving_default': model.einsum_with_kernel.get_concrete_function(), + } + save_opts = save_options.SaveOptions( + function_aliases={'einsum_with_kernel': model.einsum_with_kernel} + ) + + saved_model_save.save( + model, self._input_saved_model_path, signatures, save_opts + ) + + tags = {tag_constants.SERVING} + + quantization_options = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + experimental_method=_ExperimentalMethod.STATIC_RANGE + ), + op_set=quant_opts_pb2.OpSet.XLA, + ) + + converted_model = quantize_model.quantize( + self._input_saved_model_path, + ['serving_default'], + tags, + self._output_saved_model_path, + quantization_options, + ) + + self.assertIsNotNone(converted_model) + self.assertCountEqual( + converted_model.signatures._signatures.keys(), {'serving_default'} + ) + + # Test whether the aliased function exists. + output_loader = saved_model_loader.SavedModelLoader( + self._output_saved_model_path + ) + + # Confirm that the function alias is preserved. 
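The alias check that follows relies on the model having been saved with a function alias. A self-contained sketch of that setup using the public tf.saved_model API (the module class, fixed kernel, and export path here are invented for illustration; the test builds its model through its own helpers):

import tensorflow as tf

class EinsumModule(tf.Module):

  @tf.function(input_signature=[tf.TensorSpec(shape=[2, 2], dtype=tf.float32)])
  def einsum_with_kernel(self, x):
    # A fixed kernel keeps the example self-contained.
    return tf.einsum('ab,bc->ac', x, tf.constant([[1.0, 0.0], [0.0, 1.0]]))

m = EinsumModule()
tf.saved_model.save(
    m,
    '/tmp/einsum_aliased',
    signatures={'serving_default': m.einsum_with_kernel.get_concrete_function()},
    options=tf.saved_model.SaveOptions(
        function_aliases={'einsum_with_kernel': m.einsum_with_kernel}),
)

After quantization the alias is expected to survive in meta_info_def.function_aliases, now keyed by the rewritten function name, which is what the assertions below verify.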
+ meta_graph_def = output_loader.get_meta_graph_def_from_tags(tags) + function_aliases = meta_graph_def.meta_info_def.function_aliases + self.assertNotEmpty(function_aliases) + self.assertCountEqual(function_aliases.values(), {'einsum_with_kernel'}) + + # Test that the aliased function contains a quantized op. + for func_name, alias in function_aliases.items(): + if alias == 'einsum_with_kernel': + for func in meta_graph_def.graph_def.library.function: + if func.signature.name == func_name: + self._contains_op_with_name_and_attribute( + func.node_def, op_name='XlaDotV2', attr_name='', attr_val=None + ) + @test_util.deprecated_graph_mode_only def test_matmul_ptq_model_with_unfreeze_constants(self): # Uses large weight to exceed the constant size threshold of 64KiB @@ -2574,11 +2817,9 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertNotEmpty(warning_logs.records) # Warning message should contain the function name. + self.assertTrue(self._any_log_contains('matmul', warning_logs.records)) self.assertTrue( - self._any_warning_contains('matmul', warning_logs.records) - ) - self.assertTrue( - self._any_warning_contains( + self._any_log_contains( 'does not have min or max values', warning_logs.records ) ) @@ -2599,6 +2840,21 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): class IfModel(module.Module): """A model that contains a branching op.""" + def __init__(self): + self.filters_0 = np.random.uniform( + low=-1.0, high=1.0, size=(4, 3) + ).astype('f4') + self.bias_0 = np.random.uniform(low=-1.0, high=1.0, size=(3,)).astype( + 'f4' + ) + + self.filters_1 = np.random.uniform( + low=-1.0, high=1.0, size=(4, 3) + ).astype('f4') + self.bias_1 = np.random.uniform(low=-1.0, high=1.0, size=(3,)).astype( + 'f4' + ) + @def_function.function( input_signature=[ tensor_spec.TensorSpec(shape=[1, 4], dtype=dtypes.float32) @@ -2617,20 +2873,12 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): A map of: output key -> output result. """ if math_ops.reduce_sum(x) > 10.0: - filters = np.random.uniform(low=-1.0, high=1.0, size=(4, 3)).astype( - 'f4' - ) - bias = np.random.uniform(low=-1.0, high=1.0, size=(3,)).astype('f4') - out = math_ops.matmul(x, filters) - out = nn_ops.bias_add(out, bias) + out = math_ops.matmul(x, self.filters_0) + out = nn_ops.bias_add(out, self.bias_0) return {'output': out} - filters = np.random.uniform(low=-1.0, high=1.0, size=(4, 3)).astype( - 'f4' - ) - bias = np.random.uniform(low=-1.0, high=1.0, size=(3,)).astype('f4') - out = math_ops.matmul(x, filters) - out = nn_ops.bias_add(out, bias) + out = math_ops.matmul(x, self.filters_1) + out = nn_ops.bias_add(out, self.bias_1) return {'output': out} model = IfModel() @@ -2675,14 +2923,12 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): # Warning message should contain the function name. The uncalibrated path # is when the condition is true, so 'cond_true' function must be part of # the warning message. 
- self.assertTrue( - self._any_warning_contains('cond_true', warning_logs.records) - ) + self.assertTrue(self._any_log_contains('cond_true', warning_logs.records)) self.assertFalse( - self._any_warning_contains('cond_false', warning_logs.records) + self._any_log_contains('cond_false', warning_logs.records) ) self.assertTrue( - self._any_warning_contains( + self._any_log_contains( 'does not have min or max values', warning_logs.records ) ) @@ -3515,7 +3761,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): out = activation_fn(out) return {'output': out} - np.random.seed(1234) model = ConvModel() saved_model_save.save(model, self._input_saved_model_path) @@ -4083,7 +4328,7 @@ class DynamicRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): if target_opset == quant_opts_pb2.UNIFORM_QUANTIZED: self.assertSizeRatioGreaterThan( - self._output_saved_model_path, self._input_saved_model_path, 0.7 + self._output_saved_model_path, self._input_saved_model_path, 0.65 ) self.assertTrue( self._contains_op( @@ -4392,6 +4637,61 @@ class DynamicRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertAllClose(lookup_val, [1.0, 2.0, 0.0]) + @test_util.deprecated_graph_mode_only + def test_file_init_hash_table_lookup_model(self): + tags = {tag_constants.SERVING} + signature_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + + # Create and save a simple model that involves a hash table. + inputs, outputs = self._create_and_save_file_init_hash_table_model_tf1( + self._input_saved_model_path, tags, signature_def_key + ) + # Make sure that the desired input key and output key is present. + self.assertIn('input_vocabs', inputs.keys()) + self.assertIn('lookup', outputs.keys()) + + signature_def_keys = [signature_def_key] + quantize_model.quantize( + self._input_saved_model_path, + signature_def_keys, + tags, + self._output_saved_model_path, + quantization_options=quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + experimental_method=_ExperimentalMethod.DYNAMIC_RANGE + ), + ), + ) + + # Tests table lookup to make sure the table has been initialized + # successfully. + with session.Session(graph=ops.Graph()) as sess: + output_meta_graph_def = saved_model_loader.load( + sess, tags=tags, export_dir=self._output_saved_model_path + ) + + self.assertCountEqual( + output_meta_graph_def.signature_def.keys(), signature_def_keys + ) + + signature_def = output_meta_graph_def.signature_def[signature_def_key] + + input_tensor_name = signature_def.inputs['input_vocabs'].name + input_tensor = sess.graph.get_tensor_by_name(input_tensor_name) + + lookup_tensor_name = signature_def.outputs['lookup'].name + lookup_tensor = sess.graph.get_tensor_by_name(lookup_tensor_name) + + lookup_val = sess.run( + lookup_tensor, + feed_dict={ + input_tensor: np.array([b'dynamic', b'quantization', b'range']) + }, + ) + + # "dynamic" is not in the table: -1 (default value) + self.assertAllClose(lookup_val, [-1.0, 2.0, 1.0]) + class WeightOnlyQuantizationTest(quantize_model_test_base.QuantizedModelTest): """Test cases for weight-only quantization. 
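The expected lookup values in the hash-table tests above ([-1.0, 2.0, 1.0]) follow directly from how the vocab asset file is written by the helpers in the next file: one vocabulary entry per line, the line number as the value, and -1 as the default. A tiny pure-Python stand-in for the table lookup:

vocab = ['static', 'range', 'quantization']       # rows 0, 1, 2 of the asset file
table = {word: float(index) for index, word in enumerate(vocab)}

query = ['dynamic', 'quantization', 'range']
print([table.get(word, -1.0) for word in query])  # [-1.0, 2.0, 1.0]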
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py index 3341e1d84c4..f2593d336f7 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py @@ -42,6 +42,7 @@ from tensorflow.python.ops import string_ops from tensorflow.python.ops import variables from tensorflow.python.ops.ragged import ragged_string_ops from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model import builder from tensorflow.python.saved_model import save as saved_model_save from tensorflow.python.saved_model import signature_def_utils_impl @@ -84,6 +85,27 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): total += os.path.getsize(os.path.join(root, filename)) return total + def _any_log_contains( + self, substring: str, log_record_list: List['logging.LogRecord'] + ) -> bool: + """Returns True if any of the log contains a given substring. + + Args: + substring: A piece of string to check whether it exists in the log + message. + log_record_list: A list of `absl.logging.LogRecord`s. + + Returns: + True if and only if the substring exists in any of the log in + `log_record_list`. + """ + return any( + map( + lambda log_record: substring in str(log_record.message), + log_record_list, + ) + ) + def assertSizeRatioGreaterThan( self, path_a: str, path_b: str, threshold: float ): @@ -530,7 +552,9 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): shape=array_ops.shape(input_vocabs_placeholder), dtype=dtypes.float32 ) # shape: (?, 2) - weight = array_ops.transpose_v2(array_ops.stack([weight_row, weight_row])) + weight = array_ops.transpose_v2( + array_ops_stack.stack([weight_row, weight_row]) + ) # shape: (2, 2) output_tensor = math_ops.matmul(matmul_input, weight) @@ -725,6 +749,126 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): return input_vocabs_placeholder, lookup_vals, output_tensor + def _create_table_init_from_file_qat_model_tf1( + self, sess: session.Session + ) -> Tuple[core.Tensor, core.Tensor, core.Tensor]: + """Creates a simple QAT model that initializes a table from an asset file. + + This model creates an asset file at "vocab_file.txt" containing + comma-separated vocabularies and uses it to initialize a + `StaticVocabularyTable`. For inference, the model performs a lookup with a + 1D string tensor input vocabs. + + Args: + sess: Tensorflow Session to create the model in. + + Returns: + (input_vocabs_placeholder, lookup_vals, output_tensor), where + * input_vocabs_placeholder is a placeholder tensor of 1D strings + * lookup_vals is an output tensor that is a direct result of table lookup + * output_tensor is a float 2x2 matrix + """ + # Creates and populates an asset file. 
+ asset_dir = self.create_tempdir('assets').full_path + asset_file = os.path.join(asset_dir, 'vocab_file.txt') + content = '\n'.join(['static', 'range', 'quantization']) + file_io.write_string_to_file(filename=asset_file, file_content=content) + + # The resulting table looks like: + # "static" -> 0 + # "range" -> 1 + # "quantization" -> 2 + # default -> -1 + init = lookup_ops.TextFileInitializer( + filename=asset_file, + key_dtype=dtypes.string, + key_index=lookup_ops.TextFileIndex.WHOLE_LINE, + value_dtype=dtypes.int64, + value_index=lookup_ops.TextFileIndex.LINE_NUMBER, + ) + table = lookup_ops.StaticHashTable(init, default_value=-1) + + input_vocabs_placeholder = array_ops.placeholder( + dtypes.string, shape=(None,), name='input_vocabs' + ) + + # Introduce a matmul op that takes the lookup values to observe the + # effects of quantization. + lookup_vals = math_ops.cast( + table.lookup(input_vocabs_placeholder), dtypes.float32 + ) + # shape: (2, ?) + matmul_input = array_ops_stack.stack([lookup_vals, lookup_vals]) + matmul_input = array_ops.fake_quant_with_min_max_args( + matmul_input, min=-0.3, max=0.3, num_bits=8, narrow_range=False + ) + + # Create a dummy weight matrix filled with ones. + weight_row = array_ops.ones( + shape=array_ops.shape(input_vocabs_placeholder), dtype=dtypes.float32 + ) + # shape: (?, 2) + weight = array_ops.transpose_v2( + array_ops_stack.stack([weight_row, weight_row]) + ) + weight = array_ops.fake_quant_with_min_max_args( + weight, min=-0.1, max=0.2, num_bits=8, narrow_range=False + ) + + # shape: (2, 2) + output_tensor = math_ops.matmul(matmul_input, weight) + output_tensor = array_ops.fake_quant_with_min_max_args( + output_tensor, min=-0.2, max=0.2, num_bits=8, narrow_range=False + ) + + return input_vocabs_placeholder, lookup_vals, output_tensor + + def _create_and_save_file_init_hash_table_qat_model_tf1( + self, + output_path: str, + tags: Collection[str], + signature_def_key: str, + ) -> Tuple[Mapping[str, core.Tensor], Mapping[str, core.Tensor]]: + """Creates and saves a QAT model that uses a file-initialized table. + + The asset file "vocab_file.txt" is used to initialize a hash table. + + Args: + output_path: Path to the directory to save the created model. + tags: Set of strings that identifies the saved meta graph. + signature_def_key: Name of the SignatureDef. Used to identify the + SignatureDef within the meta graph. + + Returns: + inputs: A mapping of input_key -> input_tensor (placeholder). The input + key is "input_vocabs". + outputs: A mapping of output_key -> output_tensor. The output keys are + "lookup" and "output". 
+ """ + with session.Session(graph=ops.Graph()) as sess: + input_vocabs_placeholder, lookup_tensor, output_tensor = ( + self._create_table_init_from_file_qat_model_tf1(sess) + ) + + inputs = {'input_vocabs': input_vocabs_placeholder} + outputs = { + 'lookup': lookup_tensor, + 'output': output_tensor, + } + + self._save_tf1_model( + sess, + output_path, + signature_def_key, + tags, + inputs=inputs, + outputs=outputs, + init_op=lookup_ops.tables_initializer(), + assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS), + ) + + return inputs, outputs + def _create_data_generator( self, input_key: str, @@ -804,8 +948,16 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): def __init__(self): """Initializes a SimpleGatherAndConvModel.""" - embedding_w_val = np.random.randn(1024, 3, 4, 3).astype('f4') - self.embedding_w = embedding_w_val + self.embedding_w = np.random.randn(1024, 3, 4, 3).astype('f4') + + self.conv_filters = np.random.uniform( + low=-10, high=10, size=filter_shape + ).astype('f4') + + second_conv_filter_shape = (3, 3, filter_shape[-1], 1) + self.second_conv_filters = np.random.uniform( + low=-10, high=10, size=second_conv_filter_shape + ).astype('f4') @def_function.function( input_signature=[ @@ -823,23 +975,39 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): Returns: A map of: output key -> output result. """ - conv_filters = np.random.uniform( - low=-10, high=10, size=filter_shape - ).astype('f4') out = array_ops.gather_v2(self.embedding_w, input_tensor) + + # One pure conv + out = nn_ops.conv2d( + out, + self.conv_filters, + strides=(1, 1, 2, 1), + dilations=(1, 1, 1, 1), + padding='SAME', + data_format='NHWC', + ) + + # One fakequant attached conv if is_qat_model: out = array_ops.fake_quant_with_min_max_args( out, min=-0.1, max=0.2, num_bits=8, narrow_range=False ) - conv_filters = array_ops.fake_quant_with_min_max_args( - conv_filters, min=-0.1, max=0.2, num_bits=8, narrow_range=True + second_conv_filters = array_ops.fake_quant_with_min_max_args( + self.second_conv_filters, + min=-0.1, + max=0.2, + num_bits=8, + narrow_range=True, ) + else: + second_conv_filters = self.second_conv_filters + out = nn_ops.conv2d( out, - conv_filters, - strides=[1, 1, 2, 1], - dilations=[1, 1, 1, 1], + second_conv_filters, + strides=(1, 1, 2, 1), + dilations=(1, 1, 1, 1), padding='SAME', data_format='NHWC', ) @@ -945,6 +1113,16 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): class DepthwiseConvModel(module.Module): """A simple model with a single depthwise conv2d, bias and relu.""" + def __init__(self): + self.filters = np.random.uniform( + low=-10, high=10, size=filter_shape + ).astype('f4') + + self.out_channel_size = filter_shape[2] * filter_shape[3] + self.bias = np.random.uniform( + low=0, high=10, size=(self.out_channel_size) + ).astype('f4') + @def_function.function( input_signature=[ tensor_spec.TensorSpec(shape=input_shape, dtype=dtypes.float32) @@ -961,25 +1139,19 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): Returns: A map of: output key -> output result. 
""" - filters = np.random.uniform(low=-10, high=10, size=filter_shape).astype( - 'f4' - ) - out_channel_size = filter_shape[2] * filter_shape[3] - bias = np.random.uniform( - low=0, high=10, size=(out_channel_size) - ).astype('f4') - scale, offset = [1.0] * out_channel_size, [0.5] * out_channel_size + scale = [1.0] * self.out_channel_size + offset = [0.5] * self.out_channel_size mean, variance = scale, offset out = nn_ops.depthwise_conv2d_native( input_tensor, - filters, + self.filters, strides=[1, 2, 2, 1], dilations=[1, 1, 1, 1], padding='SAME', data_format='NHWC', ) if has_bias: - out = nn_ops.bias_add(out, bias) + out = nn_ops.bias_add(out, self.bias) if has_batch_norm: # Fusing is supported for non-training case. out, _, _, _, _, _ = nn_ops.fused_batch_norm_v3( @@ -1005,6 +1177,16 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): class ConvModel(module.Module): """A simple model with a single conv2d, bias and relu.""" + def __init__(self): + self.filters = np.random.uniform( + low=-10, high=10, size=filter_shape + ).astype('f4') + + self.out_channel_size = filter_shape[-1] + self.bias = np.random.uniform( + low=0, high=10, size=(self.out_channel_size) + ).astype('f4') + @def_function.function( input_signature=[ tensor_spec.TensorSpec(shape=input_shape, dtype=dtypes.float32) @@ -1019,25 +1201,19 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): Returns: A map of: output key -> output result. """ - filters = np.random.uniform(low=-10, high=10, size=filter_shape).astype( - 'f4' - ) - out_channel_size = filter_shape[-1] - bias = np.random.uniform( - low=0, high=10, size=(out_channel_size) - ).astype('f4') - scale, offset = [1.0] * out_channel_size, [0.5] * out_channel_size + scale = [1.0] * self.out_channel_size + offset = [0.5] * self.out_channel_size mean, variance = scale, offset out = nn_ops.conv2d( input_tensor, - filters, + self.filters, strides=[1, 1, 2, 1], dilations=[1, 1, 1, 1], padding='SAME', data_format='NHWC', ) if has_bias: - out = nn_ops.bias_add(out, bias, data_format='NHWC') + out = nn_ops.bias_add(out, self.bias, data_format='NHWC') if has_batch_norm: # Fusing is supported for non-training case. out, _, _, _, _, _ = nn_ops.fused_batch_norm_v3( @@ -1056,6 +1232,8 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): saved_model_path: str, has_bias: bool = False, activation_fn: Optional[ops.Operation] = None, + bias_size: Optional[int] = None, + use_biasadd: bool = True, ) -> module.Module: class MatmulModel(module.Module): """A simple model with a single matmul. @@ -1066,21 +1244,32 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): def __init__( self, weight_shape: Sequence[int], - has_bias: bool = False, + bias_size: Optional[int] = None, activation_fn: Optional[ops.Operation] = None, + use_biasadd: bool = True, ) -> None: """Initializes a MatmulModel. Args: weight_shape: Shape of the weight tensor. - has_bias: If True, creates and adds a bias term. + bias_size: If None, do not use bias. Else, use given size as bias. activation_fn: The activation function to be used. No activation function if None. + use_biasadd: If True, use BiasAdd for adding bias, else use AddV2. 
""" - self.has_bias = has_bias + self.bias_size = bias_size self.activation_fn = activation_fn + self.use_biasadd = use_biasadd self.filters = np.random.uniform(low=-1.0, high=1.0, size=weight_shape) - self.bias = np.random.uniform(low=-1.0, high=1.0, size=weight_shape[-1]) + + if bias_size is not None: + self.bias = np.random.uniform(low=-1.0, high=1.0, size=bias_size) + + def has_bias(self) -> bool: + return self.bias_size is not None + + def has_reshape(self) -> bool: + return self.has_bias() and self.bias_size != self.filters.shape[-1] @def_function.function def matmul(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: @@ -1098,15 +1287,40 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): """ out = math_ops.matmul(input_tensor, self.filters) - if self.has_bias: - out = nn_ops.bias_add(out, self.bias) + if self.has_reshape(): + input_shape = input_tensor.shape + if len(input_shape) == 3: + reshape_shape = (input_shape[0], -1, self.bias_size) + else: + reshape_shape = (-1, self.bias_size) + + out = array_ops.reshape(out, reshape_shape) + + if self.has_bias(): + if self.use_biasadd: + out = nn_ops.bias_add(out, self.bias) + else: + out = math_ops.add_v2(out, self.bias) if self.activation_fn is not None: out = self.activation_fn(out) return {'output': out} - model = MatmulModel(weight_shape, has_bias, activation_fn) + # If bias_size is not explictly given, it should default to width of weight. + if bias_size is None and has_bias: + bias_size = weight_shape[-1] + + # Verify that when bias_size is not None, has_bias should be True. + # And if bias_size is None, has_bias should be False using XNOR + assert (not ((bias_size is not None) ^ has_bias)) + + # Verify that bias size is correct + if bias_size: + input_height = input_shape[0] if len(input_shape) == 2 else input_shape[1] + assert input_height * weight_shape[-1] % bias_size == 0 + + model = MatmulModel(weight_shape, bias_size, activation_fn) saved_model_save.save( model, saved_model_path, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc index 7aeadeb212f..4b083d6f96c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc @@ -161,10 +161,11 @@ PYBIND11_MODULE(pywrap_quantize_model, m) { [](const absl::string_view saved_model_path, const std::vector& signature_keys, const std::unordered_set& tags, - const QuantizationOptions& quant_opts) + const QuantizationOptions& quant_opts, + const absl::flat_hash_map& function_aliases) -> absl::StatusOr { return QuantizeQatModel(saved_model_path, signature_keys, tags, - quant_opts); + quant_opts, function_aliases); }, R"pbdoc( Returns serialized ExportedModel that contains the quantized model's diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc index 95feea40ee1..0a3f2e95c36 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "llvm/ADT/SmallVector.h" @@ -267,8 +268,8 @@ absl::StatusOr ConvertMlirModuleToExportedModel( if (const auto status = ConvertMlirToGraph(module_op, config, &graph, &flib_def, &control_ret_nodes); !status.ok()) { - return absl::InternalError("Failed to convert MLIR to GraphDef. " + - status.error_message()); + return absl::InternalError( + absl::StrCat("Failed to convert MLIR to GraphDef. ", status.message())); } GraphDef graph_def{}; @@ -285,6 +286,39 @@ absl::StatusOr ConvertMlirModuleToExportedModel( function_aliases, asset_file_defs); } +// Returns the updated function aliases. `module_op` may have different function +// names from the original model, so it re-associates the aliases with the new +// function names. Both the input `function_aliases` and the returned value +// are function name -> alias mappings. `function_aliases` is the function alias +// mapping of the original function. +absl::flat_hash_map UpdateFunctionAliases( + const absl::flat_hash_map function_aliases, + mlir::ModuleOp module_op) { + absl::flat_hash_map updated_function_aliases; + + module_op->walk([&](mlir::func::FuncOp func_op) { + // We may retrieve the original function's name from the attribute. + // Functions without this attribute are ignored. + auto original_func_name = + func_op->getAttrOfType("tf._original_func_name"); + if (original_func_name) { + if (auto alias_itr = function_aliases.find(original_func_name.str()); + alias_itr != function_aliases.end()) { + const std::string alias = alias_itr->second; + const std::string new_func_name = func_op.getSymName().str(); + + updated_function_aliases[new_func_name] = alias; + + VLOG(1) << "Updated function alias. Alias: " << alias + << ", New function name: " << new_func_name + << ", Old function name: " << original_func_name.str(); + } + } + }); + + return updated_function_aliases; +} + // Runs MLIR passes with `module_op`. The passes are added by calling // `add_passes_func`, which is a callable receiving mlir::PassManager& as its // only argument. `name` identifies the set of passes added by `add_passes_func` @@ -310,7 +344,7 @@ absl::Status RunPasses(const absl::string_view name, FuncT add_passes_func, if (failed(pm.run(module_op))) { return absl::InternalError( absl::StrFormat("Failed to run pass: %s. %s", name, - diagnostic_handler.ConsumeStatus().error_message())); + diagnostic_handler.ConsumeStatus().message())); } return absl::OkStatus(); @@ -421,13 +455,15 @@ absl::StatusOr QuantizeQatModel( const absl::string_view saved_model_path, const std::vector &signature_keys, const std::unordered_set &tags, - const QuantizationOptions &quantization_options) { + const QuantizationOptions &quantization_options, + const absl::flat_hash_map &function_aliases) { // Convert the SavedModelBundle to an MLIR module. 
mlir::MLIRContext context = CreateMlirContextForTfQuantization(); MLIRImportOptions import_options; import_options.upgrade_legacy = true; import_options.lift_variables = false; + import_options.include_variables_in_initializers = true; auto bundle = std::make_unique(); // TODO(b/213406917): Add support for the object graph based saved model input @@ -437,14 +473,33 @@ absl::StatusOr QuantizeQatModel( absl::MakeSpan(exported_names), &context, import_options, &bundle); if (!module.status().ok()) { - return absl::InternalError("Failed to import SavedModel: " + - module.status().error_message()); + return absl::InternalError(absl::StrCat("Failed to import SavedModel: ", + module.status().message())); } mlir::OwningOpRef module_ref = std::move(module).value(); - TF_QUANT_RETURN_IF_ERROR(PreprocessAndFreezeGraph( - module_ref.get(), &context, bundle ? bundle->GetSession() : nullptr)); + const absl::flat_hash_map updated_function_aliases = + UpdateFunctionAliases(function_aliases, *module_ref); + + // Collect the names of the functions that have aliases so that they may not + // be inlined. + absl::flat_hash_set aliased_function_names; + absl::c_for_each(updated_function_aliases, [&](const auto &aliases) { + return aliased_function_names.insert(aliases.first); + }); + + // TODO(b/274858158): Removing this triggers an error on unit test. + if (aliased_function_names.empty()) { + TF_QUANT_RETURN_IF_ERROR(PreprocessAndFreezeGraph( + module_ref.get(), &context, bundle ? bundle->GetSession() : nullptr)); + } else { + TF_QUANT_RETURN_IF_ERROR(PreprocessAndFreezeGraph( + /*mlir_dump_file_prefix=*/kDefaultTfQuantMlirDumpFilePrefix, + /*is_inliner_run=*/false, + /*noinline_functions=*/aliased_function_names, module_ref.get(), + &context, bundle ? bundle->GetSession() : nullptr)); + } TF_QUANT_RETURN_IF_ERROR( RunPasses(/*name=*/kTfQuantQatStepName, @@ -468,44 +523,10 @@ absl::StatusOr QuantizeQatModel( RunExportPasses(export_opts, context, *module_ref)); return ConvertMlirModuleToExportedModel( - *module_ref, checkpoint_dir, - /*function_aliases=*/{}, + *module_ref, checkpoint_dir, updated_function_aliases, {asset_file_defs.begin(), asset_file_defs.end()}); } -// Returns the updated function aliases. `module_op` may have different function -// names from the original model, so it re-associates the aliases with the new -// function names. Both the input `function_aliases` and the returned value -// are function name -> alias mappings. `function_aliases` is the function alias -// mapping of the original function. -absl::flat_hash_map UpdateFunctionAliases( - const absl::flat_hash_map function_aliases, - mlir::ModuleOp module_op) { - absl::flat_hash_map updated_function_aliases; - - module_op->walk([&](mlir::func::FuncOp func_op) { - // We may retrieve the original function's name from the attribute. - // Functions without this attribute are ignored. - auto original_func_name = - func_op->getAttrOfType("tf._original_func_name"); - if (original_func_name) { - if (auto alias_itr = function_aliases.find(original_func_name.str()); - alias_itr != function_aliases.end()) { - const std::string alias = alias_itr->second; - const std::string new_func_name = func_op.getSymName().str(); - - updated_function_aliases[new_func_name] = alias; - - VLOG(1) << "Updated function alias. 
Alias: " << alias - << ", New function name: " << new_func_name - << ", Old function name: " << original_func_name.str(); - } - } - }); - - return updated_function_aliases; -} - absl::StatusOr QuantizePtqModelPreCalibration( const absl::string_view saved_model_path, const std::vector &signature_keys, @@ -518,6 +539,7 @@ absl::StatusOr QuantizePtqModelPreCalibration( MLIRImportOptions import_options; import_options.upgrade_legacy = true; import_options.lift_variables = false; + import_options.include_variables_in_initializers = true; auto bundle = std::make_unique(); // TODO(b/213406917): Add support for the object graph based saved model input @@ -528,8 +550,8 @@ absl::StatusOr QuantizePtqModelPreCalibration( &context, import_options, &bundle); if (!module.status().ok()) { - return absl::InternalError("Failed to import SavedModel: " + - module.status().error_message()); + return absl::InternalError(absl::StrCat("Failed to import SavedModel: ", + module.status().message())); } mlir::OwningOpRef module_ref = std::move(module).value(); @@ -589,6 +611,7 @@ absl::StatusOr QuantizePtqModelPostCalibration( MLIRImportOptions import_options; import_options.upgrade_legacy = true; import_options.lift_variables = false; + import_options.include_variables_in_initializers = true; auto bundle = std::make_unique(); // TODO(b/213406917): Add support for the object graph based saved model input @@ -599,8 +622,8 @@ absl::StatusOr QuantizePtqModelPostCalibration( &context, import_options, &bundle); if (!module.status().ok()) { - return absl::InternalError("Failed to import SavedModel: " + - module.status().error_message()); + return absl::InternalError(absl::StrCat("Failed to import SavedModel: ", + module.status().message())); } mlir::OwningOpRef module_ref = std::move(module).value(); @@ -661,6 +684,7 @@ absl::StatusOr QuantizePtqDynamicRange( MLIRImportOptions import_options; import_options.upgrade_legacy = true; import_options.lift_variables = false; + import_options.include_variables_in_initializers = true; auto bundle = std::make_unique(); // TODO(b/213406917): Add support for the object graph based saved model input @@ -671,8 +695,8 @@ absl::StatusOr QuantizePtqDynamicRange( &context, import_options, &bundle); if (!module.status().ok()) { - return absl::InternalError("Failed to import SavedModel: " + - module.status().error_message()); + return absl::InternalError(absl::StrCat("Failed to import SavedModel: ", + module.status().message())); } mlir::OwningOpRef module_ref = std::move(module).value(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h index c3747fee523..f17f20df4b6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h @@ -47,7 +47,8 @@ absl::StatusOr QuantizeQatModel( absl::string_view saved_model_path, const std::vector& signature_keys, const std::unordered_set& tags, - const QuantizationOptions& quant_opts); + const QuantizationOptions& quant_opts, + const absl::flat_hash_map& function_aliases); // Apply post-training dynamic range quantization to the model. 
absl::StatusOr QuantizePtqDynamicRange( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py index 56057c222cd..758f99b62a0 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py @@ -33,6 +33,7 @@ from tensorflow.python.client import session from tensorflow.python.eager import context from tensorflow.python.eager import wrap_function from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_conversion from tensorflow.python.lib.io import file_io from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model import loader_impl as saved_model_loader @@ -192,7 +193,9 @@ def _convert_values_to_tf_tensors( if isinstance(tensorlike_value, core.Tensor): tensor_value = tensorlike_value else: - tensor_value = ops.convert_to_tensor_v2_with_dispatch(tensorlike_value) + tensor_value = tensor_conversion.convert_to_tensor_v2_with_dispatch( + tensorlike_value + ) tensor_mapping[name] = tensor_value @@ -461,6 +464,7 @@ def _run_graph_for_calibration( signature_keys: Sequence[str], tags: Collection[str], representative_dataset: repr_dataset.RepresentativeDatasetOrMapping, + force_graph_mode_calibration: bool, ) -> None: """Runs the graph for calibration using representative datasets. @@ -475,6 +479,8 @@ def _run_graph_for_calibration( `signature_keys` contains more than one signature key, `representative_dataset` should be a mapping that maps each signature key to the corresponding representative dataset. + force_graph_mode_calibration: If set to true, it forces calibration in graph + mode instead of eager mode when the context is in eager mode. Raises: ValueError iff: @@ -495,11 +501,13 @@ def _run_graph_for_calibration( representative_dataset_map = {signature_keys[0]: representative_dataset} try: - if context.executing_eagerly(): + if context.executing_eagerly() and not force_graph_mode_calibration: + logging.info('Calibration step is executed in eager mode.') _run_graph_for_calibration_eager_mode( float_model_dir, tags, representative_dataset_map ) else: + logging.info('Calibration step is executed in graph mode.') _run_graph_for_calibration_graph_mode( float_model_dir, tags, representative_dataset_map ) @@ -511,85 +519,6 @@ def _run_graph_for_calibration( logging.info('Calibration step complete.') -def _run_static_range_qat( - src_saved_model_path: str, - dst_saved_model_path: str, - signature_def_keys: Sequence[str], - tags: Collection[str], - quant_opts: quant_opts_pb2.QuantizationOptions, - signature_def_map: _SignatureDefMap, -) -> None: - """Runs static-range quantization for a Quantization-Aware Trained model. - - Runs the quantization for a model trained using QAT. - - Args: - src_saved_model_path: Path to the source SavedModel directory. - dst_saved_model_path: Path to the destination SavedModel directory. - signature_def_keys: Keys of the signatures of the functions that are the - target for quantization. - tags: Tags identifying the MetaGraphDef. - quant_opts: Quantization options. - signature_def_map: Signature def key -> SignatureDef mapping.
- """ - logging.info('Running static-range quantization for QAT model.') - exported_model_serialized = pywrap_quantize_model.quantize_qat_model( - src_saved_model_path, - list(signature_def_keys), - set(tags), - quant_opts.SerializeToString(), - ) - - exported_model = exported_model_pb2.ExportedModel.FromString( - exported_model_serialized - ) - - save_model.save_model_v1( - exported_model.graph_def, - dst_saved_model_path, - signature_def_map, - tags, - init_op_name=exported_model.init_node_name, - saver_def=_get_saver_def_or_none(exported_model), - checkpoint_dir=exported_model.checkpoint_dir, - function_aliases=exported_model.function_aliases, - asset_file_defs=exported_model.asset_file_defs, - ) - - -def _add_calibration_statistics(graph_def: graph_pb2.GraphDef) -> None: - """Adds calibration statistics to the graph def. - - This function must be run after running the graph with a representative - dataset. Retrieves calibration statistics from the global calibrator and adds - them to the corresponding nodes as attributes. - - Args: - graph_def: GraphDef to add calibration statistics to. - """ - for function_def in graph_def.library.function: - for node_def in function_def.node_def: - if node_def.op != 'CustomAggregator': - continue - - node_id = node_def.attr['id'].s - try: - min_val = pywrap_quantize_model.get_min_from_calibrator(node_id) - max_val = pywrap_quantize_model.get_max_from_calibrator(node_id) - pywrap_quantize_model.clear_data_from_calibrator(node_id) - node_def.attr['min'].f = float(min_val) - node_def.attr['max'].f = float(max_val) - except ValueError: - logging.warn( - ( - 'CustomAggregator id "%s" from FunctionDef "%s" does not have ' - 'min or max values. Parts of this function are not quantized.' - ), - node_id.decode('utf-8'), - function_def.signature.name, - ) - - def _copy_assets(src_path: str, dst_path: str) -> None: """Copies the assets directory of the saved model. @@ -623,6 +552,94 @@ def _copy_assets(src_path: str, dst_path: str) -> None: ) +def _run_static_range_qat( + src_saved_model_path: str, + dst_saved_model_path: str, + signature_def_keys: Sequence[str], + tags: Collection[str], + quant_opts: quant_opts_pb2.QuantizationOptions, + signature_def_map: _SignatureDefMap, +) -> None: + """Runs static-range quantization for a Quantization-Aware Trained model. + + Runs the quantization for a model trained using QAT. + + Args: + src_saved_model_path: Path to the source SavedModel directory. + dst_saved_model_path: Path to the destination SavedModel directory. + signature_def_keys: Keys of the signatures of the functions that are the + target for quantization. + tags: Tags identifying the MetaGraphDef. + quant_opts: Quantization options. + signature_def_map: Signature def key -> SignatureDef mapping. 
+ """ + logging.info('Running static-range quantization for QAT model.') + + loader = saved_model_loader.SavedModelLoader(src_saved_model_path) + function_aliases = loader.get_meta_graph_def_from_tags( + tags + ).meta_info_def.function_aliases + + exported_model_serialized = pywrap_quantize_model.quantize_qat_model( + src_saved_model_path, + list(signature_def_keys), + set(tags), + quant_opts.SerializeToString(), + dict(function_aliases), + ) + + exported_model = exported_model_pb2.ExportedModel.FromString( + exported_model_serialized + ) + + save_model.save_model_v1( + exported_model.graph_def, + dst_saved_model_path, + signature_def_map, + tags, + init_op_name=exported_model.init_node_name, + saver_def=_get_saver_def_or_none(exported_model), + checkpoint_dir=exported_model.checkpoint_dir, + function_aliases=exported_model.function_aliases, + asset_file_defs=exported_model.asset_file_defs, + ) + + _copy_assets(src_saved_model_path, dst_saved_model_path) + + +def _add_calibration_statistics(graph_def: graph_pb2.GraphDef) -> None: + """Adds calibration statistics to the graph def. + + This function must be run after running the graph with a representative + dataset. Retrieves calibration statistics from the global calibrator and adds + them to the corresponding nodes as attributes. + + Args: + graph_def: GraphDef to add calibration statistics to. + """ + for function_def in graph_def.library.function: + for node_def in function_def.node_def: + if node_def.op != 'CustomAggregator': + continue + + node_id = node_def.attr['id'].s + try: + min_val = pywrap_quantize_model.get_min_from_calibrator(node_id) + max_val = pywrap_quantize_model.get_max_from_calibrator(node_id) + pywrap_quantize_model.clear_data_from_calibrator(node_id) + node_def.attr['min'].f = float(min_val) + node_def.attr['max'].f = float(max_val) + except ValueError: + logging.warn( + ( + 'CustomAggregator id "%s" from FunctionDef "%s" does not have ' + 'min or max values. Parts of this function are not quantized.' 
+ ), + node_id.decode('utf-8'), + function_def.signature.name, + ) + + def _get_saver_def_or_none( exported_model: exported_model_pb2.ExportedModel, ) -> Optional[saver_pb2.SaverDef]: @@ -721,6 +738,7 @@ def _run_static_range_ptq( signature_def_keys, tags, representative_dataset, + quant_opts.force_graph_mode_calibration, ) _add_calibration_statistics(graph_def) @@ -911,7 +929,7 @@ def _dynamic_range_quantize( # please also update default value in tflite converter: # tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc;l=201 if quantization_options.min_num_elements_for_weights == 0: - (quantization_options.min_num_elements_for_weights) = ( + quantization_options.min_num_elements_for_weights = ( _DYNAMIC_RANGE_DEFAULT_MIN_NUM_ELEMENTS_FOR_WEIGHTS ) logging.warn( @@ -941,9 +959,14 @@ def _dynamic_range_quantize( exported_model.graph_def, output_directory, signature_def_map, - tags=tags, + tags, init_op_name=exported_model.init_node_name, + saver_def=_get_saver_def_or_none(exported_model), + checkpoint_dir=exported_model.checkpoint_dir, + function_aliases=exported_model.function_aliases, + asset_file_defs=exported_model.asset_file_defs, ) + _copy_assets(saved_model_path, output_directory) return saved_model_load(output_directory) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto index 5e3f3bba9a5..6c5c520bffc 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto @@ -132,7 +132,7 @@ message FreezeAllVariables { // 2) A set of supported operations. // 3) Unit wise quantization precision. // 4) Target hardware name. -// NEXT ID: 11 +// NEXT ID: 12 message QuantizationOptions { // The default quantization configuration for the model. If the below // unit-wise configuration does not exist, we use this default quantization @@ -181,4 +181,8 @@ message QuantizationOptions { // Produces legacy weight-only graph where the qconst op (containing quantized // values) is followed by a dequantization op. bool enable_legacy_weight_only = 10; + + // If set to true, it forces calibration in graph mode instead of eager mode + // when the context is in eager mode.
+ bool force_graph_mode_calibration = 11; } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc index 362971c5d42..a3b43e62e5e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc @@ -93,6 +93,9 @@ void AddQuantizePtqDynamicRangePasses( pm.addNestedPass( mlir::TF::CreateUnrollBatchMatMulPassPass()); pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + if (quantization_options.experimental_enable_tpu_model_support()) { + pm.addPass(mlir::quant::CreateConvertTpuModelToCpuPass()); + } pm.addNestedPass( mlir::quant::CreatePrepareLiftingPass(quantization_options.op_set())); pm.addPass(mlir::quant::CreateLiftQuantizableSpotsAsFunctionsDRQPass( @@ -134,11 +137,11 @@ void AddQuantizePtqPreCalibrationPasses( mlir::TF::CreateUnrollBatchMatMulPassPass()); } pm.addPass(mlir::TF::CreateTFShapeInferencePass()); - pm.addNestedPass( - mlir::quant::CreatePrepareLiftingPass(quantization_options.op_set())); if (quantization_options.experimental_enable_tpu_model_support()) { pm.addPass(mlir::quant::CreateConvertTpuModelToCpuPass()); } + pm.addNestedPass( + mlir::quant::CreatePrepareLiftingPass(quantization_options.op_set())); pm.addPass(mlir::quant::CreateLiftQuantizableSpotsAsFunctionsPass( quantization_options.op_set(), quantization_options.enable_two_input_tensors())); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/cast_bf16_ops_to_f32.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/cast_bf16_ops_to_f32.mlir index 61d9288a5a8..4fc6cbf3f97 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/cast_bf16_ops_to_f32.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/cast_bf16_ops_to_f32.mlir @@ -47,8 +47,8 @@ func.func @cast_bf16_avg_pool_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3 // CHECK: func @cast_bf16_avg_pool_to_fp32 // CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK: %[[conv:.*]] = "tf.Conv2D"(%arg0, %[[cst]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> -// CHECK: %[[avg_pool:.*]] = "tf.AvgPool"(%[[conv]]) {data_format = "NHWC", ksize = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK: %[[conv:.*]] = "tf.Conv2D"(%arg0, %[[cst]]) +// CHECK: %[[avg_pool:.*]] = "tf.AvgPool"(%[[conv]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[avg_pool]]) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> // CHECK: return %[[identity]] : tensor<1x3x2x2xf32> @@ -63,7 +63,7 @@ func.func @cast_bf16_matmul_to_fp32(%arg0: tensor<1x10xf32>) -> (tensor<1x2xf32> // CHECK: func @cast_bf16_matmul_to_fp32 // CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<10x2xf32>} : () -> tensor<10x2xf32> -// CHECK: %[[matmul:.*]] = "tf.MatMul"(%arg0, %[[cst]]) {transpose_a = false, transpose_b = false} : (tensor<1x10xf32>, tensor<10x2xf32>) -> tensor<1x2xf32> +// CHECK: %[[matmul:.*]] = "tf.MatMul"(%arg0, %[[cst]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[matmul]]) // CHECK: return %[[identity]] : tensor<1x2xf32> @@ -78,7 +78,7 @@ func.func @cast_bf16_depthwise_conv_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tens // CHECK: func 
@cast_bf16_depthwise_conv_to_fp32 // CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> -// CHECK: %[[depthwise_conv:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[cst]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x6xf32> +// CHECK: %[[depthwise_conv:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[cst]]) // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[depthwise_conv]]) {device = ""} : (tensor<1x2x2x6xf32>) -> tensor<1x2x2x6xf32> // CHECK: return %[[identity]] : tensor<1x2x2x6xf32> @@ -97,35 +97,18 @@ func.func @cast_bf16_batch_matmul_v2_to_fp32(%arg0: tensor<1x1x10xf32>) -> (tens // CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[batch_matmul]]) {device = ""} : (tensor<1x1x2xf32>) -> tensor<1x1x2xf32> // CHECK: return %[[identity]] : tensor<1x1x2xf32> -func.func @cast_bf16_gather_v2_to_fp32(%arg0: tensor<1xi64>) -> (tensor<1x3x4x3xf32>) { - %cst = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor - %cst_0 = "tf.Const"() {device = "", value = dense<1.000000e+01> : tensor<1024x3x4x3xbf16>} : () -> tensor<1024x3x4x3xbf16> - %0 = "tf.GatherV2"(%cst_0, %arg0, %cst) {batch_dims = 0 : i64, device = ""} : (tensor<1024x3x4x3xbf16>, tensor<1xi64>, tensor) -> tensor<1x3x4x3xbf16> - %1 = "tf.Cast"(%0) {Truncate = false} : (tensor<1x3x4x3xbf16>) -> tensor<1x3x4x3xf32> - %2 = "tf.IdentityN"(%1) {device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> - return %2 : tensor<1x3x4x3xf32> -} - -// CHECK: func @cast_bf16_gather_v2_to_fp32 -// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<{{.*}}> : tensor<1024x3x4x3xf32>} : () -> tensor<1024x3x4x3xf32> -// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor -// CHECK: %[[gather:.*]] = "tf.GatherV2"(%[[cst]], %arg0, %[[cst_0]]) {batch_dims = 0 : i64} : (tensor<1024x3x4x3xf32>, tensor<1xi64>, tensor) -> tensor<1x3x4x3xf32> -// CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[gather]]) -// CHECK: return %[[identity]] : tensor<1x3x4x3xf32> - // Tests that an AddV2 op accepting two bf16 operands is transformed into // an AddV2 op that accepts two fp32 operands. -func.func @cast_bf16_add_v2_to_fp32(%arg0: tensor<2xbf16>, %arg1: tensor<2xbf16>) -> tensor<2xbf16> { +func.func @cast_bf16_add_v2_to_fp32(%arg0: tensor<2xbf16>, %arg1: tensor<2xbf16>) -> tensor<2xf32> { %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2xbf16>, tensor<2xbf16>) -> tensor<2xbf16> - return %0 : tensor<2xbf16> + %1 = "tf.Cast"(%0) {Truncate = false} : (tensor<2xbf16>) -> tensor<2xf32> + return %1 : tensor<2xf32> } // The signature of the function is not changed. -// CHECK: func @cast_bf16_add_v2_to_fp32(%[[ARG_0:.*]]: tensor<2xbf16>, %[[ARG_1:.*]]: tensor<2xbf16>) -> tensor<2xbf16> +// CHECK: func @cast_bf16_add_v2_to_fp32(%[[ARG_0:.*]]: tensor<2xbf16>, %[[ARG_1:.*]]: tensor<2xbf16>) -> tensor<2xf32> // bfloat16 operands are cast to f32 operands. // CHECK-DAG: %[[CAST_0:.*]] = "tf.Cast"(%[[ARG_0]]) {Truncate = false} : (tensor<2xbf16>) -> tensor<2xf32> // CHECK-DAG: %[[CAST_1:.*]] = "tf.Cast"(%[[ARG_1]]) {Truncate = false} : (tensor<2xbf16>) -> tensor<2xf32> // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[CAST_0]], %[[CAST_1]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// f32 outputs are cast back to bfloat16. 
-// CHECK: %[[CAST_2:.*]] = "tf.Cast"(%[[ADD]]) {Truncate = false} : (tensor<2xf32>) -> tensor<2xbf16> -// CHECK: return %[[CAST_2]] : tensor<2xbf16> +// CHECK: return %[[ADD]] : tensor<2xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tpu_model_to_cpu.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tpu_model_to_cpu.mlir index cebbba385e5..f7c8c6aaabb 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tpu_model_to_cpu.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_tpu_model_to_cpu.mlir @@ -19,7 +19,7 @@ func.func private @tpu_func_0_optim0(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x2 %2 = "tf.Transpose"(%0, %cst_0) {device = ""} : (tensor<1x3x4x3xbf16>, tensor<4xi32>) -> tensor<1x3x3x4xbf16> %3 = "tf.TPUReplicatedInput"(%2) {device = "", index = -1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor<1x3x3x4xbf16>) -> tensor<1x3x3x4xbf16> %4 = "tf.Transpose"(%3, %cst_1) {_tpu_replicate = "cluster", device = ""} : (tensor<1x3x3x4xbf16>, tensor<4xi32>) -> tensor<1x3x4x3xbf16> - %5 = "tf.Conv2D"(%4, %cst) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> + %5 = "tf.Conv2D"(%4, %cst) {_tpu_replicate = "cluster", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> %6 = "tf.TPUReplicatedOutput"(%5) {device = ""} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xbf16> %7 = "tf.Cast"(%6) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32> func.return %7 : tensor<1x3x2x2xf32> @@ -43,9 +43,7 @@ func.func @serving_default(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor // The contents of `@serving_default` should have been inlined to `@batch_func`. 
// CHECK: func.func @serving_default(%[[ARG0:.*]]: tensor<1xf32>, %[[ARG1:.*]]: tensor<1xf32>) -> tensor<1xf32> // CHECK-NOT: tf.BatchFunction -// CHECK: %[[IDENTITY0:.*]] = "tf.Identity"(%[[ARG0]]) -// CHECK: %[[IDENTITY1:.*]] = "tf.Identity"(%[[ARG1]]) -// CHECK: %[[ADD0:.*]] = "tf.AddV2"(%[[IDENTITY0]], %[[IDENTITY1]]) +// CHECK: %[[ADD0:.*]] = "tf.AddV2"(%[[ARG0]], %[[ARG1]]) // CHECK: return %[[ADD0]] : tensor<1xf32> func.func private @batched_func(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir index debff9b9e26..f61a9fbe9fe 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir @@ -195,6 +195,35 @@ func.func @float_matmul( // ----- +func.func @float_matmul_with_reshape(%arg0: tensor<1x10xf32>, %arg1: tensor<10x10xf32>) -> (tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<10xf32>} : () -> tensor<10xf32> + %cst_0 = "tf.Const"() {value = dense<[-1, 10]> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tf.MatMul"(%arg0, %arg1) { + transpose_a = false, transpose_b = true + } : (tensor<1x10xf32>, tensor<10x10xf32>) -> tensor<*xf32> + %2 = "tf.Reshape"(%1, %cst_0) : (tensor<*xf32>, tensor<2xi32>) -> tensor<*xf32> + %3 = "tf.BiasAdd"(%2, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<10xf32>) -> tensor<*xf32> + + func.return %3 : tensor<*xf32> + + +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<10xf32>} +// CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[-1, 10]> : tensor<2xi32>} +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]], %[[SHAPE]]) +// CHECK-SAME: f = @composite_matmul_with_reshape_and_bias_fn_1} +// CHECK: return %[[PARTITIONEDCALL_0]] +// CHECK: } + +// CHECK-LABEL: private @composite_matmul_with_reshape_and_bias_fn_1 +// CHECK-NEXT: tf.MatMul"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:transpose_a,1:transpose_b" +// CHECK-NEXT: tf.Reshape +// CHECK-NEXT: tf.BiasAdd +// CHECK-NEXT: return +} + +// ----- + // CHECK-LABEL: float_conv_no_bias func.func @float_conv_no_bias(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) { %0 = "tf.Conv2D"(%arg0, %arg1) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir index 9d0a807aa52..ee97c375ba9 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir @@ -1,4 +1,5 @@ -// RUN: tf-quant-opt %s -quant-prepare-lifting | FileCheck %s +// RUN: tf-quant-opt %s -quant-prepare-lifting -split-input-file | FileCheck %s +// RUN: tf-quant-opt %s -quant-prepare-lifting='target-opset=XLA' | FileCheck --check-prefix=XLA-CHECK %s func.func @decompose_batch_norm(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { %cst = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> @@ -13,6 +14,8 @@ func.func @decompose_batch_norm(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { // CHECK: %[[add:.*]] = "tf.AddV2"(%[[mul]], %[[CONST]]) : (tensor<*xf32>, tensor<2xf32>) -> 
tensor<*xf32> // CHECK-NEXT: return %[[add]] : tensor<*xf32> +// ----- + func.func @not_decompose_batch_norm(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { %cst = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> %cst_0 = "tf.Const"() {value = dense<0.500000e+00> : tensor<2xf32>} : () -> tensor<2xf32> @@ -25,6 +28,8 @@ func.func @not_decompose_batch_norm(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { // CHECK: %[[bn:.*]], %batch_mean, %batch_variance, %reserve_space_1, %reserve_space_2, %reserve_space_3 = "tf.FusedBatchNormV3"(%arg0, %[[CONST]], %[[CONST_0]], %[[CONST_0]], %[[CONST]]) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = true} : (tensor<*xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) // CHECK-NEXT: return %[[bn]] : tensor<*xf32> +// ----- + func.func @convert_add_to_biasadd(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { %cst = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> %cst_0 = "tf.Const"() {value = dense<0.500000e+00> : tensor<2xf32>} : () -> tensor<2xf32> @@ -39,6 +44,8 @@ func.func @convert_add_to_biasadd(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2 // CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[BIASADD]] : tensor<1x3x2x2xf32> +// ----- + func.func @not_convert_add_to_biasadd(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x3xf32>) { %cst = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x3xf32>} : () -> tensor<2x3x3x3xf32> %cst_0 = "tf.Const"() {value = dense<0.500000e+00> : tensor<1x3x2x3xf32>} : () -> tensor<1x3x2x3xf32> @@ -53,6 +60,8 @@ func.func @not_convert_add_to_biasadd(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3 // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[CONV2D]], %[[CONST_0]]) : (tensor<1x3x2x3xf32>, tensor<1x3x2x3xf32>) -> tensor<1x3x2x3xf32> // CHECK-NEXT: return %[[ADD]] : tensor<1x3x2x3xf32> +// ----- + func.func @fuse_conv2d_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2xf32>} : () -> tensor<2xf32> @@ -65,6 +74,8 @@ func.func @fuse_conv2d_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf3 // CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[CONV2D]] : tensor<1x3x2x2xf32> +// ----- + func.func @not_fuse_conv2d_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2x2xf32>} : () -> tensor<2x2xf32> @@ -79,6 +90,8 @@ func.func @not_fuse_conv2d_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x // CHECK-NEXT: %[[ADD:.*]] = "tf.Mul"(%[[CONV2D]], %[[CONST_0]]) : (tensor<1x3x2x2xf32>, tensor<2x2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[ADD]] : tensor<1x3x2x2xf32> +// ----- + func.func @fuse_conv2d_with_bias_and_mul(%arg0: tensor<1x3x4x3xf32>) -> 
(tensor<1x3x2x2xf32>) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2xf32>} : () -> tensor<2xf32> @@ -95,6 +108,8 @@ func.func @fuse_conv2d_with_bias_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor< // CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[BIASADD]] : tensor<1x3x2x2xf32> +// ----- + func.func @not_fuse_conv2d_with_bias_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>, tensor<1x3x2x2xf32>) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2xf32>} : () -> tensor<2xf32> @@ -113,6 +128,8 @@ func.func @not_fuse_conv2d_with_bias_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (ten // CHECK-NEXT: %[[MUL:.*]] = "tf.Mul"(%[[CONV2D]], %[[CONST_1]]) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[BIASADD]], %[[MUL]] : tensor<1x3x2x2xf32>, tensor<1x3x2x2xf32> +// ----- + func.func @fuse_conv2d_with_bias_and_add(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> %cst_0 = "tf.Const"() {value = dense<0.500000e+00> : tensor<2xf32>} : () -> tensor<2xf32> @@ -129,6 +146,8 @@ func.func @fuse_conv2d_with_bias_and_add(%arg0: tensor<1x3x4x3xf32>) -> (tensor< // CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[BIASADD]] : tensor<1x3x2x2xf32> +// ----- + func.func @not_fuse_conv2d_with_bias_and_add(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2xf32>) -> (tensor<1x3x2x2xf32>) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2xf32>} : () -> tensor<2xf32> @@ -145,6 +164,8 @@ func.func @not_fuse_conv2d_with_bias_and_add(%arg0: tensor<1x3x4x3xf32>, %arg1: // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[BIASADD]], %arg1) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> // CHECK-NEXT: return %[[ADD]] : tensor<1x3x2x2xf32> +// ----- + func.func @match_depthwise_conv2d_and_add(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> @@ -159,6 +180,8 @@ func.func @match_depthwise_conv2d_and_add(%arg0: tensor<*xf32>) -> (tensor<*xf32 // CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor<*xf32> // CHECK-NEXT: return %[[BIASADD]] : tensor<*xf32> +// ----- + func.func @match_depthwise_conv2d_and_mul(%arg0: tensor<*xf32>) -> (tensor) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> @@ -171,6 +194,8 @@ func.func @match_depthwise_conv2d_and_mul(%arg0: tensor<*xf32>) -> (tensor, tensor<2x3x3x1xf32>) -> tensor // CHECK-NEXT: return %[[DEPTHWISE_CONV2D]] : tensor +// ----- + func.func @match_depthwise_conv2d_with_bias_and_add(%arg0: tensor<*xf32>) -> (tensor) { %cst = "tf.Const"() 
{value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> @@ -187,6 +212,8 @@ func.func @match_depthwise_conv2d_with_bias_and_add(%arg0: tensor<*xf32>) -> (te // CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor // CHECK-NEXT: return %[[BIASADD]] : tensor +// ----- + func.func @match_depthwise_conv2d_with_bias_and_mul(%arg0: tensor<*xf32>) -> (tensor) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> @@ -203,6 +230,8 @@ func.func @match_depthwise_conv2d_with_bias_and_mul(%arg0: tensor<*xf32>) -> (te // CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor // CHECK-NEXT: return %[[BIASADD]] : tensor +// ----- + func.func @lower_einsum(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> tensor<3x4x6xf32> { %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,ikm->ijm"}: (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> func.return %0 : tensor<3x4x6xf32> @@ -210,6 +239,7 @@ func.func @lower_einsum(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> t // CHECK-LABEL: lower_einsum // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> +// ----- func.func @removing_identity_after_const(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> @@ -225,6 +255,8 @@ func.func @removing_identity_after_const(%arg0: tensor<*xf32>) -> (tensor<*xf32> // CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> // CHECK: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) +// ----- + func.func @not_removing_identity_of_returning_value(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> @@ -239,22 +271,24 @@ func.func @not_removing_identity_of_returning_value(%arg0: tensor<*xf32>) -> (te // CHECK: %[[identity:.*]] = "tf.Identity" // CHECK: return %[[identity]] : tensor<*xf32> +// ----- + func.func @batch_norm_with_q_dq(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { - %cst = "tf.Const"() {device = "", value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> - %cst_0 = "tf.Const"() {device = "", value = dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> - %cst_1 = "tf.Const"() {device = "", value = dense<5.000000e-01> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> - %0 = "quantfork.qcast"(%cst_1) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.003937007874015748,0.003937007874015748}>> - %1 = "quantfork.dcast"(%0) : (tensor<2x3x3x2x!quant.uniform:f32:3, {0.003937007874015748,0.003937007874015748}>>) -> tensor<2x3x3x2xf32> - %2 = "quantfork.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> - %3 = "quantfork.dcast"(%2) : (tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3xf32> - %4 = "tf.Conv2D"(%3, %1) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], 
explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> - %y, %batch_mean, %batch_variance, %reserve_space_1, %reserve_space_2, %reserve_space_3 = "tf.FusedBatchNormV3"(%4, %cst, %cst_0, %cst, %cst_0) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = false} : (tensor<1x3x2x2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<1x3x2x2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<*xf32>) - %5 = "tf.Relu6"(%y) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> - %6 = "quantfork.qcast"(%5) : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2x!quant.uniform:f32:3, {0.0026771653824903836:-60,0.0032283464285332388:-28}>> - %7 = "quantfork.dcast"(%6) : (tensor<1x3x2x2x!quant.uniform:f32:3, {0.0026771653824903836:-60,0.0032283464285332388:-28}>>) -> tensor<1x3x2x2xf32> - %8 = "tf.Identity"(%7) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> - %9 = "tf.Identity"(%8) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> - return %9 : tensor<1x3x2x2xf32> + %cst = "tf.Const"() {device = "", value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {device = "", value = dense<5.000000e-01> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "quantfork.qcast"(%cst_1) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.003937007874015748,0.003937007874015748}>> + %1 = "quantfork.dcast"(%0) : (tensor<2x3x3x2x!quant.uniform:f32:3, {0.003937007874015748,0.003937007874015748}>>) -> tensor<2x3x3x2xf32> + %2 = "quantfork.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> + %3 = "quantfork.dcast"(%2) : (tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3xf32> + %4 = "tf.Conv2D"(%3, %1) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %y, %batch_mean, %batch_variance, %reserve_space_1, %reserve_space_2, %reserve_space_3 = "tf.FusedBatchNormV3"(%4, %cst, %cst_0, %cst, %cst_0) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = false} : (tensor<1x3x2x2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<1x3x2x2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<*xf32>) + %5 = "tf.Relu6"(%y) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + %6 = "quantfork.qcast"(%5) : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2x!quant.uniform:f32:3, {0.0026771653824903836:-60,0.0032283464285332388:-28}>> + %7 = "quantfork.dcast"(%6) : (tensor<1x3x2x2x!quant.uniform:f32:3, {0.0026771653824903836:-60,0.0032283464285332388:-28}>>) -> tensor<1x3x2x2xf32> + %8 = "tf.Identity"(%7) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + %9 = "tf.Identity"(%8) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + return %9 : tensor<1x3x2x2xf32> } // CHECK: func @batch_norm_with_q_dq @@ -267,3 +301,80 @@ func.func @batch_norm_with_q_dq(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf // CHECK: %[[conv:.*]] = "tf.Conv2D"(%[[dq_input]], %[[dq_weight]]) // CHECK: %[[bias:.*]] = "tf.BiasAdd"(%[[conv]], 
%[[cst_0]]) {data_format = "NHWC"} // CHECK: %[[relu6:.*]] = "tf.Relu6"(%[[bias]]) + +// ----- + +func.func @xla_dot_v2(%arg0: tensor, %arg1: tensor<3x4x5xf32>) -> (tensor) { + %0 = "tf.XlaDotV2"(%arg0, %arg1) {device = "", dimension_numbers = "\0A\01\02\12\01\00", precision_config = ""} : (tensor, tensor<3x4x5xf32>) -> tensor + func.return %0 : tensor +} + +// CHECK: func @xla_dot_v2 +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<[3, 20]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() {value = dense<[-1, 2, 4, 5]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[reshape:.*]] = "tf.Reshape"(%arg1, %[[cst]]) : (tensor<3x4x5xf32>, tensor<2xi64>) -> tensor<3x20xf32> +// CHECK: %[[batch_matmul:.*]] = "tf.BatchMatMulV2"(%arg0, %[[reshape]]) {adj_x = false, adj_y = false} : (tensor, tensor<3x20xf32>) -> tensor +// CHECK: %[[reshape_0:.*]] = "tf.Reshape"(%[[batch_matmul]], %[[cst_0]]) : (tensor, tensor<4xi64>) -> tensor +// CHECK: return %[[reshape_0]] : tensor + +// XLA-CHECK: func @xla_dot_v2 +// XLA-CHECK: %[[einsum:.*]] = "tf.Einsum"(%arg0, %arg1) {equation = "abc,cde->abde"} : (tensor, tensor<3x4x5xf32>) -> tensor +// XLA-CHECK: return %[[einsum]] : tensor + +// ----- + +// dimension_numbers: { +// offset_dims: 0 +// collapsed_slice_dims: 1 +// start_index_map: 1 +// } +func.func @xla_gather(%arg0: tensor, %arg1: tensor<1xi32>, %arg2: tensor<2xi32>) -> tensor<*xf32> { + %0 = "tf.XlaGather"(%arg0, %arg1, %arg2) {device = "", dimension_numbers = "\0A\01\00\12\01\01\1A\01\01", indices_are_sorted = true} : (tensor, tensor<1xi32>, tensor<2xi32>) -> tensor<*xf32> + func.return %0 : tensor<*xf32> +} + +// CHECK: func @xla_gather +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() {value = dense<1> : tensor<1x1xi64>} : () -> tensor<1x1xi64> +// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() {value = dense<-1> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: %[[arg1_i64:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi64> +// CHECK: %[[tensor_scatter_update:.*]] = "tf.TensorScatterUpdate"(%[[cst]], %[[cst_0]], %[[arg1_i64]]) : (tensor<2xi64>, tensor<1x1xi64>, tensor<1xi64>) -> tensor<2xi64> +// CHECK: %[[arg2_i64:.*]] = "tf.Cast"(%arg2) {Truncate = false} : (tensor<2xi32>) -> tensor<2xi64> +// CHECK: %[[slice:.*]] = "tf.Slice"(%arg0, %[[tensor_scatter_update]], %[[arg2_i64]]) : (tensor, tensor<2xi64>, tensor<2xi64>) -> tensor<*xf32> +// CHECK: %[[reshape:.*]] = "tf.Reshape"(%[[slice]], %[[cst_1]]) : (tensor<*xf32>, tensor<1xi64>) -> tensor<*xf32> +// CHECK: return %[[reshape]] : tensor<*xf32> + +// ----- + +// Tests that the converted `tf.Slice` has the correct number of dimensions +// when the output shape is known (`tensor` instead of `tensor<*xi32>`). 
+ +func.func @xla_gather_known_output_shape(%arg0: tensor<5xi32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>) -> tensor { + // dimension_numbers: { + // collapsed_slice_dims: 0 + // start_index_map: 0 + // } + %0 = "tf.XlaGather"(%arg0, %arg1, %arg2) {device = "", dimension_numbers = "\12\01\00\1A\01\00", indices_are_sorted = true} : (tensor<5xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + func.return %0 : tensor +} + +// CHECK: func @xla_gather_known_output_shape +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() {value = dense<0> : tensor<1x1xi64>} : () -> tensor<1x1xi64> +// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() {value = dense<> : tensor<0xi64>} : () -> tensor<0xi64> +// CHECK: %[[arg1_i64:.*]] = "tf.Cast"(%arg1) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi64> +// CHECK: %[[tensor_scatter_update:.*]] = "tf.TensorScatterUpdate"(%[[cst]], %[[cst_0]], %[[arg1_i64]]) : (tensor<1xi64>, tensor<1x1xi64>, tensor<1xi64>) -> tensor<1xi64> +// CHECK: %[[arg2_i64:.*]] = "tf.Cast"(%arg2) {Truncate = false} : (tensor<1xi32>) -> tensor<1xi64> +// CHECK: %[[slice:.*]] = "tf.Slice"(%arg0, %[[tensor_scatter_update]], %[[arg2_i64]]) : (tensor<5xi32>, tensor<1xi64>, tensor<1xi64>) -> tensor<1xi32> +// CHECK: %[[reshape:.*]] = "tf.Reshape"(%[[slice]], %[[cst_1]]) : (tensor<1xi32>, tensor<0xi64>) -> tensor +// CHECK: return %[[reshape]] : tensor + +// ----- + +func.func @replace_checknumerics_to_identity(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.CheckNumerics"(%arg0) {device = "", message = "transformer"} : (tensor<*xf32>) -> tensor<*xf32> + func.return %0 : tensor<*xf32> +} + +// CHECK: func @replace_checknumerics_to_identity +// CHECK: %[[out:.*]] = "tf.Identity"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> \ No newline at end of file diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir index 10bedcff581..d04ec262f6f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir @@ -19,8 +19,8 @@ func.func private @conv(%input: tensor<1x3x4x3xf32> {tf._user_specified_name = " func.return %dq_res : tensor<*xf32> } -// CHECK-DAG: [[bias:%.+]] = "arith.constant"() {value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: [[weight:%.+]] = "arith.constant"() {value = dense_resource<__elided__> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2x!quant.uniform> +// CHECK-DAG: [[bias:%.+]] = "arith.constant"() <{value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: [[weight:%.+]] = "arith.constant"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2x!quant.uniform> // CHECK: [[q_input:%.+]] = "quantfork.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> // CHECK-NEXT: [[q_bias:%.+]] = "quantfork.qcast"([[bias]]) : (tensor<2xf32>) -> tensor<2x!quant.uniform> // CHECK-NEXT: [[conv:%.+]] = "tf.PartitionedCall"([[q_input]], [[weight]], [[q_bias]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @[[composite_fn:composite_conv2d_with_bias_and_relu6_fn.*]]} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> diff --git 
a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir index 5ba40e0eb1d..663b2efd580 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_xla.mlir @@ -139,3 +139,59 @@ module { // CHECK: Number of quantize layers added: 1 // CHECK: Number of dequantize layers added: 1 } + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1219 : i32}, tf_saved_model.semantics} { + func.func @embedding_with_one_float_conv_and_one_quantized_conv(%arg0: tensor<1xi32> {tf_saved_model.index_path = ["input"]}) -> (tensor<1x3x1x1xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x3x1024x1xf32>} : () -> tensor<3x3x1024x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<1024x3x4x3xf32>} : () -> tensor<1024x3x4x3xf32> + %cst_1 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2x3x3x1024xf32>} : () -> tensor<2x3x3x1024xf32> + + %0 = "tf.PartitionedCall"(%cst_0, %arg0, %cst_1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_gather_fn_1} : (tensor<1024x3x4x3xf32>, tensor<1xi32>, tensor) -> tensor<1x3x4x3xf32> + %1 = "tf.PartitionedCall"(%0, %cst_2) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_fn_2} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1024xf32>) -> tensor<1x3x2x1024xf32> + %2 = "quantfork.qcast"(%1) : (tensor<1x3x2x1024xf32>) -> tensor<1x3x2x1024x!quant.uniform> + %3 = "quantfork.dcast"(%2) : (tensor<1x3x2x1024x!quant.uniform>) -> tensor<1x3x2x1024xf32> + %4 = "tf.PartitionedCall"(%3, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_fn_1} : (tensor<1x3x2x1024xf32>, tensor<3x3x1024x1xf32>) -> tensor<1x3x1x1xf32> + %5 = "quantfork.qcast"(%4) : (tensor<1x3x1x1xf32>) -> tensor<1x3x1x1x!quant.uniform> + %6 = "quantfork.dcast"(%5) : (tensor<1x3x1x1x!quant.uniform>) -> tensor<1x3x1x1xf32> + return %6 : tensor<1x3x1x1xf32> + } + func.func private @composite_gather_fn_1(%arg0: tensor<1024x3x4x3xf32>, %arg1: tensor<1xi32>, %arg2: tensor) -> tensor<1x3x4x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.GatherV2"(%arg0, %arg1, %arg2) {attr_map = "0:batch_dims", batch_dims = 0 : i64, device = ""} : (tensor<1024x3x4x3xf32>, tensor<1xi32>, tensor) -> tensor<1x3x4x3xf32> + return %0 : tensor<1x3x4x3xf32> + } + func.func private @composite_conv2d_fn_2(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x1024xf32>) -> tensor<1x3x2x1024xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1024xf32>) -> tensor<1x3x2x1024xf32> + return %0 : tensor<1x3x2x1024xf32> + } + func.func private 
@composite_conv2d_fn_1(%arg0: tensor<1x3x2x1024xf32>, %arg1: tensor<3x3x1024x1xf32>) -> tensor<1x3x1x1xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x2x1024xf32>, tensor<3x3x1024x1xf32>) -> tensor<1x3x1x1xf32> + return %0 : tensor<1x3x1x1xf32> + } + +// CHECK-LABEL: func @embedding_with_one_float_conv_and_one_quantized_conv + +// CHECK: %[[quantized_gather:.*]] = "tf.PartitionedCall"( +// CHECK-SAME: f = @quantized_gather_float_output_fn_0 +// CHECK: %[[float_conv:.*]] = "tf.PartitionedCall"(%[[quantized_gather]] +// CHECK-SAME: f = @composite_conv2d_fn_2 +// CHECK: %[[quantize:.*]] = "tf.PartitionedCall"(%[[float_conv]] +// CHECK-SAME: f = @quantize_i8 +// CHECK: %[[quantized_conv:.*]] = "tf.PartitionedCall"(%[[quantize]] +// CHECK-SAME: f = @quantized_conv2d_float_output_fn_0 + +// CHECK: -------- Quantization Summary -------- +// CHECK: Number of quantized layers in the model +// CHECK: -------------------------------- +// CHECK: Name Count/Total +// CHECK: ================================ +// CHECK: Gather 1/1 +// CHECK: Conv2D 1/2 + +// CHECK: Number of quantized layers with quantized outputs: 0/2 +// CHECK: Number of quantize layers added: 1 +// CHECK: Number of dequantize layers added: 0 +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir index abe0c997195..c500b3c72e8 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir @@ -13,7 +13,7 @@ module { return %0 : tensor<*xf32> } -// CHECK: %[[cst:.*]] = "arith.constant"() {value = dense<0.000000e+00> : tensor<2x1024xf32>} : () -> tensor<2x1024xf32> +// CHECK: %[[cst:.*]] = "arith.constant"() <{value = dense<0.000000e+00> : tensor<2x1024xf32>}> : () -> tensor<2x1024xf32> // CHECK: %[[q_cst:.*]] = "quantfork.qcast"(%[[cst]]) : (tensor<2x1024xf32>) -> tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>> // CHECK: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_cst]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>>) -> tensor<*xf32> // CHECK: "func.return"(%[[out]]) : (tensor<*xf32>) -> () diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir index 9123e41967e..4356d084a56 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir @@ -19,8 +19,8 @@ func.func private @conv(%input: tensor<1x3x4x3xf32> {tf._user_specified_name = " func.return %dq_res : tensor<*xf32> } -// CHECK-DAG: [[bias:%.+]] = "arith.constant"() {value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK-DAG: [[weight:%.+]] = "arith.constant"() {value = dense_resource<__elided__> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2x!quant.uniform> +// CHECK-DAG: [[bias:%.+]] = "arith.constant"() <{value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>}> : () -> tensor<2xf32> +// 
CHECK-DAG: [[weight:%.+]] = "arith.constant"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2x!quant.uniform> // CHECK: [[q_input:%.+]] = "quantfork.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> // CHECK-NEXT: [[q_bias:%.+]] = "quantfork.qcast"([[bias]]) : (tensor<2xf32>) -> tensor<2x!quant.uniform> // CHECK-NEXT: [[conv:%.+]] = "tf.PartitionedCall"([[q_input]], [[weight]], [[q_bias]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @[[composite_fn:composite_conv2d_with_bias_and_relu6_fn.*]]} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD index f2d89b2df75..9f02d680300 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD @@ -63,6 +63,7 @@ cc_library( "//tensorflow/compiler/mlir/quantization/tensorflow:uniform_op_quant_spec", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", ], diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.cc index e6f74a654aa..cb301ec8276 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.cc @@ -14,10 +14,12 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h" +#include #include #include #include #include +#include #include "absl/container/flat_hash_map.h" #include "llvm/ADT/StringMap.h" @@ -35,6 +37,26 @@ namespace mlir::quant { using QuantMethod = tensorflow::quantization::QuantizationMethod::ExperimentalMethod; +enum class OpType { + kDynamicRangeOp, // Dynamic Range kernels only have rhs attr. + kUnaryOp, // Unary ops have one min/max attr. + kBinaryOp, // Binary ops have lhs/rhs attr. + kQuantizationOp, // Quantization ops have input/output attr. +}; + +// For each op type, the following axis carries axis information: +// kDynamicRangeOp: rhs_quantization_axis will carry axis information. +// kUnaryOp: quantization_axis will carry axis information. +// kBinaryOp: Among {lhs, rhs, output}_quantization_axis, only check rhs. +// kQuantizationOp: Among {input, output}_quantization_axis, only check input. +// We therefore check exemplary 3 axes {rhs_, input_, }quantization_axis from +// previous accumulations. +constexpr std::array kQuantizationAxisAttrs = { + "input_quantization_axis", "quantization_axis", "rhs_quantization_axis"}; + +// Common suffixes for attributes used in FillQuantizationAttributes. 
+constexpr std::array kSuffixes = {"_min_val", "_max_val"}; + Attribute GetWindowStridesValue( PatternRewriter& rewriter, llvm::StringMap& identifier_to_attr) { ArrayAttr stride = identifier_to_attr["strides"].dyn_cast(); @@ -103,50 +125,73 @@ Attribute GetBatchGroupCountValue( return rewriter.getI64IntegerAttr(1); } +Attribute GetQuantizationAxis(PatternRewriter& rewriter, Operation* op, + const int operand_index) { + auto* defining_op = op->getOperand(operand_index).getDefiningOp(); + for (auto attr : kQuantizationAxisAttrs) { + if (defining_op->hasAttr(attr)) { + return defining_op->getAttr(attr); + } + } + // Not found. + return rewriter.getI64IntegerAttr(-1); +} + void FillQuantizationAttributes(PatternRewriter& rewriter, Operation* op, NamedAttrList& attrs, llvm::StringMap& identifier_to_attr, - QuantMethod quantization_method) { + OpType op_type) { // TODO(b/259374419): Support broader quantization schemes absl::flat_hash_map min_max_scheme_for_8bit_narrow; min_max_scheme_for_8bit_narrow = {{"min", -127}, {"max", 127}}; - std::set quantization_attributes; - if (quantization_method == - tensorflow::quantization::QuantizationMethod::DYNAMIC_RANGE) { - quantization_attributes = { - "rhs_quantization_min_val", - "rhs_quantization_max_val", - }; - } else { - quantization_attributes = { - "lhs_quantization_min_val", "lhs_quantization_max_val", - "rhs_quantization_min_val", "rhs_quantization_max_val", - "output_quantization_min_val", "output_quantization_max_val", - }; + std::vector quantization_attributes; + switch (op_type) { + case OpType::kDynamicRangeOp: + quantization_attributes = {"rhs_quantization"}; + break; + case OpType::kUnaryOp: + quantization_attributes = {"quantization"}; + break; + case OpType::kBinaryOp: + quantization_attributes = {"lhs_quantization", "rhs_quantization", + "output_quantization"}; + break; + case OpType::kQuantizationOp: + quantization_attributes = {"input_quantization", "output_quantization"}; + break; + default: + quantization_attributes = {}; + break; } for (const auto& attr : quantization_attributes) { - auto quant_val = absl::StrContains(attr, "min") - ? min_max_scheme_for_8bit_narrow["min"] - : min_max_scheme_for_8bit_narrow["max"]; - auto quant_val_attr = rewriter.getI64IntegerAttr(quant_val); - attrs.push_back(rewriter.getNamedAttr(attr, quant_val_attr)); + for (int i = 0; i < kSuffixes.size(); i++) { + auto quant_val = i == 0 ? min_max_scheme_for_8bit_narrow["min"] + : min_max_scheme_for_8bit_narrow["max"]; + std::string attr_minmax = absl::StrCat(attr, kSuffixes[i]); + attrs.push_back(rewriter.getNamedAttr( + attr_minmax, rewriter.getI64IntegerAttr(quant_val))); + } } } +// This LogicalResult covers both the hybrid and fully quantized op cases. LogicalResult FillAttributesForUniformQuantizedDotOp( PatternRewriter& rewriter, Operation* op, llvm::StringMap& identifier_to_attr, QuantMethod quantization_method, bool enable_per_channel_quantization) { NamedAttrList attrs; - // Fill quantization related attributes. - FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, - quantization_method); - - if (!(quantization_method == - tensorflow::quantization::QuantizationMethod::DYNAMIC_RANGE)) { + if (quantization_method == + tensorflow::quantization::QuantizationMethod::DYNAMIC_RANGE) { + // Fill quantization related attributes for Hybrid op. + FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kDynamicRangeOp); + } else { + // Fill quantization related attributes for fully quantized op. 
+ FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kBinaryOp); // Per-channel activation is not supported attrs.push_back(rewriter.getNamedAttr("lhs_quantization_axis", rewriter.getI64IntegerAttr(-1))); @@ -158,7 +203,7 @@ LogicalResult FillAttributesForUniformQuantizedDotOp( absl::flat_hash_set operands = spec->quantizable_operands; int quant_dim = -1; if (enable_per_channel_quantization && operands.size() == 1) { - quant_dim = spec->coeff_op_quant_dim[*(spec->quantizable_operands.begin())]; + quant_dim = spec->coeff_op_quant_dim[*(operands.begin())]; } attrs.push_back(rewriter.getNamedAttr("rhs_quantization_axis", rewriter.getI64IntegerAttr(quant_dim))); @@ -168,6 +213,7 @@ LogicalResult FillAttributesForUniformQuantizedDotOp( return success(); } +// This LogicalResult covers both the hybrid and fully quantized op cases. LogicalResult FillAttributesForUniformQuantizedConvolutionOp( PatternRewriter& rewriter, Operation* op, llvm::StringMap& identifier_to_attr, @@ -211,9 +257,16 @@ LogicalResult FillAttributesForUniformQuantizedConvolutionOp( attrs.push_back(rewriter.getNamedAttr( feature_group_cnt_attr, rewriter.getI64IntegerAttr(feature_group_cnt))); - // Fill quantization related attributes. - FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, - quantization_method); + if (quantization_method == + tensorflow::quantization::QuantizationMethod::DYNAMIC_RANGE) { + // Fill quantization related attributes for Hybrid op. + FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kDynamicRangeOp); + } else { + // Fill quantization related attributes for fully quantized op. + FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kBinaryOp); + } if (quantization_method != tensorflow::quantization::QuantizationMethod::DYNAMIC_RANGE) { @@ -228,7 +281,7 @@ LogicalResult FillAttributesForUniformQuantizedConvolutionOp( absl::flat_hash_set operands = spec->quantizable_operands; int quant_dim = -1; if (enable_per_channel_quantization && operands.size() == 1) { - quant_dim = spec->coeff_op_quant_dim[*(spec->quantizable_operands.begin())]; + quant_dim = spec->coeff_op_quant_dim[*(operands.begin())]; } attrs.push_back(rewriter.getNamedAttr("rhs_quantization_axis", rewriter.getI64IntegerAttr(quant_dim))); @@ -238,4 +291,84 @@ LogicalResult FillAttributesForUniformQuantizedConvolutionOp( return success(); } +LogicalResult FillAttributesForUniformQuantizedAddOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + const QuantMethod quantization_method, + const bool enable_per_channel_quantization) { + NamedAttrList attrs; + + // Fill quantization related attributes. + FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kBinaryOp); + Attribute activation_quantization_axis = rewriter.getI64IntegerAttr(-1); + if (enable_per_channel_quantization) { + // If either of lhs or rhs is per-channel quantized, the quantization axis + // must match for lhs, rhs, and output. 
+ activation_quantization_axis = + GetQuantizationAxis(rewriter, op, /*operand_index=*/0); + if (activation_quantization_axis == rewriter.getI64IntegerAttr(-1)) { + activation_quantization_axis = + GetQuantizationAxis(rewriter, op, /*operand_index=*/1); + } + } + attrs.push_back(rewriter.getNamedAttr("lhs_quantization_axis", + activation_quantization_axis)); + attrs.push_back(rewriter.getNamedAttr("rhs_quantization_axis", + activation_quantization_axis)); + attrs.push_back(rewriter.getNamedAttr("output_quantization_axis", + activation_quantization_axis)); + op->setAttrs(rewriter.getDictionaryAttr(attrs)); + + return success(); +} + +LogicalResult FillAttributesForUniformQuantizedClipByValueOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + QuantMethod quantization_method, bool enable_per_channel_quantization) { + NamedAttrList attrs; + + // Fill quantization related attributes. + FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kUnaryOp); + + Attribute activation_quantization_axis = rewriter.getI64IntegerAttr(-1); + if (enable_per_channel_quantization) { + activation_quantization_axis = + GetQuantizationAxis(rewriter, op, /*operand_index=*/0); + } + attrs.push_back( + rewriter.getNamedAttr("quantization_axis", activation_quantization_axis)); + op->setAttrs(rewriter.getDictionaryAttr(attrs)); + + return success(); +} + +LogicalResult FillAttributesForUniformRequantizeOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + QuantMethod quantization_method, bool enable_per_channel_quantization) { + NamedAttrList attrs; + + // Fill quantization related attributes. + FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kQuantizationOp); + + Attribute activation_quantization_axis = rewriter.getI64IntegerAttr(-1); + if (enable_per_channel_quantization) { + activation_quantization_axis = + GetQuantizationAxis(rewriter, op, /*operand_index=*/0); + } + // For per-axis -> per-axis requantization, input and output quantization axis + // must be equal. 
+ attrs.push_back(rewriter.getNamedAttr("input_quantization_axis", + activation_quantization_axis)); + attrs.push_back(rewriter.getNamedAttr("output_quantization_axis", + activation_quantization_axis)); + op->setAttrs(rewriter.getDictionaryAttr(attrs)); + + return success(); +} + } // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h index 547473f3d90..b8e2a8bcd4f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h @@ -39,6 +39,27 @@ LogicalResult FillAttributesForUniformQuantizedConvolutionOp( quantization_method, bool enable_per_channel_quantization); +LogicalResult FillAttributesForUniformQuantizedAddOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::ExperimentalMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformQuantizedClipByValueOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::ExperimentalMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformRequantizeOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::ExperimentalMethod + quantization_method, + bool enable_per_channel_quantization); + } // namespace mlir::quant #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TO_UNIFORM_ATTRIBUTE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index db721642303..74a804ceb97 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -71,6 +71,7 @@ tool_names = [ 'flatbuffer_to_string', 'flatbuffer_translate', 'hlo_to_llvm_ir', + 'ifrt-opt', 'json_to_flatbuffer', 'kernel-gen-opt', 'lhlo-tfrt-opt', diff --git a/tensorflow/compiler/mlir/runlit.site.cfg.py b/tensorflow/compiler/mlir/runlit.site.cfg.py index 293118cda2b..0fa778bcc3a 100644 --- a/tensorflow/compiler/mlir/runlit.site.cfg.py +++ b/tensorflow/compiler/mlir/runlit.site.cfg.py @@ -53,6 +53,7 @@ mlir_tf_tools_dirs = [ 'tensorflow/compiler/xla/mlir/tools/mlir_bisect', 'tensorflow/compiler/xla/mlir_hlo', 'tensorflow/compiler/xla/mlir_hlo/tosa', + 'tensorflow/compiler/xla/python/ifrt/ir/tests', 'tensorflow/compiler/xla/service/gpu/tests', 'tensorflow/compiler/xla/service/mlir_gpu', 'tensorflow/compiler/xla/translate', diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index b5b6746d645..d49bca20c10 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -407,7 +407,7 @@ cc_library( ":tensorflow_passes", ":tf_saved_model_passes", "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", - "//tensorflow/compiler/mlir/tf2xla:legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:legalize_tf", "//tensorflow/compiler/xla/mlir_hlo:mhlo_passes", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", @@ -459,8 +459,8 @@ cc_library( srcs = [ "ir/tf_dialect.h", "ir/tf_ops.h", - "ir/tfrt_ops.h", "ir/tf_remaining_ops.h", + "ir/tfrt_ops.h", ] + ["ir/tf_" + target["name"] + ".cc" for 
target in tf_ops_category_list] + ["ir/tf_" + target["name"] + ".cc.inc" for target in tf_ops_category_list] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], @@ -475,8 +475,8 @@ cc_library( deps = [ ":attribute_utils", ":dynamic_shape_utils", - ":tensorflow_attributes", ":rewrite_util", + ":tensorflow_attributes", ":tensorflow_canonicalize_inc_gen", ":tensorflow_op_interfaces", ":tensorflow_op_interfaces_inc_gen", @@ -484,24 +484,24 @@ cc_library( ":tensorflow_structs", ":tensorflow_traits", ":tensorflow_types", + ":tf_arith_ops_folder", ":tf_ops_canonicalization_helper", ":tf_ops_device_helper", ":tf_ops_layout_helper", ":tf_ops_tensor_helper", - ":tf_arith_ops_folder", + "//tensorflow/core:framework", + "//tensorflow/core:lib", "@llvm-project//llvm:Support", "@llvm-project//mlir:ControlFlowInterfaces", "@llvm-project//mlir:DerivedAttributeOpInterface", "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:LoopLikeInterface", "@llvm-project//mlir:Parser", "@llvm-project//mlir:SideEffectInterfaces", - "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:Support", - "//tensorflow/core:framework", - "//tensorflow/core:lib", ] + [":tensorflow_" + target["name"] + "_inc_gen" for target in tf_ops_category_list], ) @@ -510,8 +510,8 @@ cc_library( srcs = [ "ir/tf_dialect.h", "ir/tf_ops.h", - "ir/tf_remaining_ops.h", "ir/tf_remaining_ops.cc", + "ir/tf_remaining_ops.h", "ir/tfrt_ops.h", ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], hdrs = [ @@ -555,9 +555,9 @@ cc_library( srcs = [ "ir/tf_dialect.h", "ir/tf_ops.h", - "ir/tfrt_ops.h", - "ir/tfrt_ops.cc", "ir/tf_remaining_ops.h", + "ir/tfrt_ops.cc", + "ir/tfrt_ops.h", ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], hdrs = [ ], @@ -1014,7 +1014,6 @@ cc_library( ":session_utils", ":tensorflow", ":tensorflow_ops", - ":tensorflow_passes", "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework_internal", "@llvm-project//llvm:Support", @@ -1209,12 +1208,14 @@ cc_library( "transforms/device_index_selector.cc", "transforms/drop_while_shape_invariant.cc", "transforms/einsum.cc", + "transforms/embedding_pipelining.cc", "transforms/executor_island_coarsening.cc", "transforms/executor_tpuv1_inline_tpu_island.cc", "transforms/executor_tpuv1_island_coarsening.cc", "transforms/executor_tpuv1_outline_tpu_island.cc", "transforms/extract_head_tail_outside_compilation.cc", "transforms/extract_outside_compilation.cc", + "transforms/extract_tpu_copy_with_dynamic_shape_op.cc", "transforms/fold_broadcast.cc", "transforms/functional_control_flow_to_cfg.cc", "transforms/functional_control_flow_to_regions.cc", @@ -1267,9 +1268,11 @@ cc_library( "transforms/test_resource_alias_analysis.cc", "transforms/tf_data_optimization_pass.cc", "transforms/tf_device_assignment.cc", + "transforms/tpu_annotate_dynamic_shape_inputs.cc", "transforms/tpu_cluster_cleanup_attributes.cc", "transforms/tpu_cluster_formation.cc", "transforms/tpu_colocate_composite_resource_ops.cc", + "transforms/tpu_colocate_splits.cc", "transforms/tpu_device_propagation.cc", "transforms/tpu_dynamic_layout_pass.cc", "transforms/tpu_host_computation_expansion.cc", @@ -1354,8 +1357,8 @@ cc_library( "//tensorflow/compiler/jit:flags_headers", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite:validators", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf", - 
"//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf_with_tf2xla", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/tf2xla:side_effect_util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -1416,10 +1419,10 @@ cc_library( ":serialize_mlir_module_utils", ":shape_inference_utils", ":tensorflow", - ":tensorflow_types", ":tf_device_pass_inc_gen", ":tf_pass_inc_gen", ":translate_utils", + "//tensorflow/compiler/tf2xla/kernels:xla_call_module_loader", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -1582,6 +1585,7 @@ cc_library( "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/graph/regularization:util", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:inlined_vector", @@ -1996,19 +2000,22 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - ":constant_fold_utils", ":convert_tensor", - ":eval_util", + ":export_graphdef", + ":export_tf_dialect_op", ":tensorflow", ":tensorflow_traits", ":tensorflow_types", - "//tensorflow/c:tf_status", "//tensorflow/compiler/xla/stream_executor", + "//tensorflow/core:all_kernels", + "//tensorflow/core:direct_session", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core/ops", + "//tensorflow/core/tfrt/fallback:fallback_state", + "//tensorflow/core/tfrt/fallback:op_kernel_runner", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", - "@llvm-project//mlir:SideEffectInterfaces", "@llvm-project//mlir:Support", ], alwayslink = 1, @@ -2084,7 +2091,7 @@ cc_library( ":mlir_import_options", ":mlir_roundtrip_flags", "//tensorflow/cc/saved_model:bundle_v2", - "//tensorflow/cc/saved_model:loader", + "//tensorflow/cc/saved_model:loader_lite", "//tensorflow/cc/saved_model:reader", "//tensorflow/core:graph", "//tensorflow/core:lib", @@ -2349,7 +2356,9 @@ cc_library( srcs = ["utils/tpu_rewrite_device_util.cc"], hdrs = ["utils/tpu_rewrite_device_util.h"], deps = [ + ":device_util", ":tensorflow", + ":tensorflow_types", "//tensorflow/compiler/mlir/utils:string_container_utils", "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -2370,6 +2379,8 @@ tf_cc_test( srcs = ["utils/tpu_rewrite_device_util_test.cc"], deps = [ ":device_util", + ":serialize_mlir_module_utils", + ":tensorflow", ":tpu_rewrite_device_util", "//tensorflow/core:framework", "//tensorflow/core:test", @@ -2513,8 +2524,14 @@ tf_cc_test( cc_library( name = "bridge_logger", - srcs = ["utils/bridge_logger.cc"], - hdrs = ["utils/bridge_logger.h"], + srcs = [ + "utils/bridge_logger.cc", + "utils/data_dumper_logger_config.cc", + ], + hdrs = [ + "utils/bridge_logger.h", + "utils/data_dumper_logger_config.h", + ], deps = [ ":dump_mlir_util", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index c7374f6fa72..b6d0ff71211 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -147,7 +147,7 @@ LogicalResult ParallelExecuteOp::verify() { } int output_index = 0; - for (auto& region_and_index : llvm::enumerate(regions)) { + for (const auto& region_and_index : llvm::enumerate(regions)) { auto& region = 
region_and_index.value(); auto* region_terminator = region.front().getTerminator(); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index c11c6edd591..14ff8f37ae8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -671,7 +671,7 @@ array([b'3.14', b'2.72'], dtype=object) }]; let arguments = (ins - TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8, TF_Variant]>:$input, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8, TF_Variant]>:$input, DefaultValuedOptionalAttr:$precision, DefaultValuedOptionalAttr:$scientific, @@ -19882,7 +19882,7 @@ If two elements are equal, the lower-index element appears first. let arguments = (ins Arg:$input, - Arg, [{0-D. Number of top elements to look for along the last dimension (along each row for matrices).}]>:$k, DefaultValuedOptionalAttr:$sorted @@ -19890,10 +19890,12 @@ row for matrices).}]>:$k, let results = (outs Res:$values, - Res:$indices + Res, [{The indices of `values` within the last dimension of `input`.}]>:$indices ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tk = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr index_type = TF_DerivedResultTypeAttr<1>; let hasVerifier = 1; } @@ -20216,6 +20218,8 @@ Must have same shape with `output_scales`.}]>:$output_zero_points, ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasVerifier = 1; } def TF_UniformQuantizedClipByValueOp : TF_Op<"UniformQuantizedClipByValue", [Pure]> { @@ -20248,6 +20252,8 @@ Same shape condition as scales.}]>:$zero_points, ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasVerifier = 1; } def TF_UniformQuantizedConvolutionOp : TF_Op<"UniformQuantizedConvolution", [Pure]> { @@ -21048,6 +21054,33 @@ where(input) ==> [[0, 0, 0], TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_WriteTrainingPredictionsOp : TF_Op<"WriteTrainingPredictions", [DeclareOpInterfaceMethods]> { + let summary = [{ +Writes the given predictions into a RecordIO file using a previously + }]; + + let description = [{ +initialized global TrainingPredictionWriter. The predictions are transformed +into a PredictionData proto before they are written to the file. + }]; + + let arguments = (ins + Arg:$keys, + Arg, [{A list of float tensors containing prediction values.}]>:$predictions_list, + Arg:$step, + Arg:$timestamp_usec, + + StrArrayAttr:$prediction_names, + BoolAttr:$training, + StrAttr:$file_path + ); + + let results = (outs); + + TF_DerivedOperandSizeAttr num_predictions = TF_DerivedOperandSizeAttr<1>; +} + def TF_XdivyOp : TF_Op<"Xdivy", [Pure, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns 0 if x == 0, and x / y otherwise, elementwise."; @@ -21123,8 +21156,8 @@ def TF_XlaCallModuleOp : TF_Op<"XlaCallModule", [Pure]> { let summary = "Invokes a StableHLO module."; let description = [{ -This op is experimental and is intended for use with JAX native serialization -in a TensorFlow context. 
+This op is used with JAX native serialization in a TensorFlow context with +stability guarantees. }]; let arguments = (ins @@ -21137,7 +21170,8 @@ platform argument (see `platforms`) nor the dimension arguments (see StrAttr:$module, TF_ShapeAttrArray:$Sout, DefaultValuedOptionalAttr:$dim_args_spec, - DefaultValuedOptionalAttr:$platforms + DefaultValuedOptionalAttr:$platforms, + DefaultValuedOptionalAttr:$function_list ); let results = (outs @@ -22609,7 +22643,8 @@ A pseudo-op to represent host-side computation in an XLA program. StrAttr:$send_key, StrAttr:$recv_key, - DefaultValuedOptionalAttr:$host_mlir_module + DefaultValuedOptionalAttr:$host_mlir_module, + DefaultValuedOptionalAttr:$manual_sharding ); let results = (outs @@ -22636,7 +22671,8 @@ A placeholder op to receive values from a running XLA computation. execution the transfer corresponds to.}]>:$dynamic_key, StrAttr:$key, - I64Attr:$device_ordinal + I64Attr:$device_ordinal, + DefaultValuedOptionalAttr:$device_type ); let results = (outs @@ -22656,7 +22692,8 @@ A placeholder op to receive values from a running XLA computation with support f execution the transfer corresponds to.}]>:$dynamic_key, Arg:$device_ordinal, - StrAttr:$key + StrAttr:$key, + DefaultValuedOptionalAttr:$device_type ); let results = (outs @@ -22675,7 +22712,8 @@ def TF__XlaSendFromHostOp : TF_Op<"_XlaSendFromHost", [DeclareOpInterfaceMethods execution the transfer corresponds to.}]>:$dynamic_key, StrAttr:$key, - I64Attr:$device_ordinal + I64Attr:$device_ordinal, + DefaultValuedOptionalAttr:$device_type ); let results = (outs); @@ -22694,7 +22732,8 @@ A placeholder op to send values to a running XLA computation with support for a execution the transfer corresponds to.}]>:$dynamic_key, Arg:$device_ordinal, - StrAttr:$key + StrAttr:$key, + DefaultValuedOptionalAttr:$device_type ); let results = (outs); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index 8a15e3b3e85..83525c93047 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -188,6 +188,7 @@ def TF_TPUExecuteResource : TF_ResourceBase<"TPUExecute">; def TF_RandomGeneratorResource : TF_ResourceBase<"RandomGenerator">; def TF_XlaHostComputeResource : TF_ResourceBase<"XlaHostCompute">; def TF_XlaLaunchResource : TF_ResourceBase<"XlaLaunch">; +def TF_WriteTrainingPredictionsResource : TF_ResourceBase<"WriteTrainingPredictions">; def TF_CollectiveReduceOrderingResource : TF_ResourceBase<"CollectiveReduceOrdering">; def TF_NcclAllReduceOrderingResource : TF_ResourceBase<"NcclAllReduceOrdering">; @@ -252,6 +253,7 @@ def TF_XlaHostComputeSideEffect : MemoryEffects<[MemWrite]>; +def TF_WriteTrainingPredictions : MemoryEffects<[MemWrite]>; def TF_RandomGeneratorSideEffect : MemoryEffects<[MemWrite]>; // Special effect for keeping `CollectiveReduce` ops in order. @@ -294,6 +296,13 @@ def TF_ShapeAttr : TF_TensorFlowAttr<"Shape", "shape"> { def TF_ShapeAttrArray : TypedArrayAttrBase; +// An array of FlatSymbolRef attributes that can be used as a default valued +// attribute. 
+def TF_SymbolRefArrayAttr : + TypedArrayAttrBase { + let constBuilderCall = "::mlir::ArrayAttr::get($_builder.getContext(), $0)"; +} + //===----------------------------------------------------------------------===// // TensorFlow type definitions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h index 88cbf879d56..db8208893e7 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h @@ -123,15 +123,15 @@ ResourceHandleValueAndId GetResourceHandleValueAndIdBase( // and have at least one operand, result type can be inferred using the first // operand's type. -#define INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(Op) \ - LogicalResult Op::inferReturnTypeComponents( \ - MLIRContext* context, std::optional location, \ - ValueShapeRange operands, DictionaryAttr attributes, \ - RegionRange regions, \ - SmallVectorImpl& inferredReturnShapes) { \ - return inferReturnTypeComponentsFromOperands(context, location, operands, \ - attributes, regions, \ - inferredReturnShapes); \ +#define INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(Op) \ + LogicalResult Op::inferReturnTypeComponents( \ + MLIRContext* context, std::optional location, \ + ValueShapeRange operands, DictionaryAttr attributes, \ + OpaqueProperties properties, RegionRange regions, \ + SmallVectorImpl& inferredReturnShapes) { \ + return inferReturnTypeComponentsFromOperands( \ + context, location, operands, attributes, properties, regions, \ + inferredReturnShapes); \ } #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h.inc" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 80857c23765..c9a890778f6 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -1868,4 +1868,101 @@ def TF_SelectV2Op : TF_Op<"SelectV2", [Pure, ResultsBroadcastableShape]> { ]; } +def TF_TPUCopyWithDynamicShapeOp : TF_Op<"TPUCopyWithDynamicShape", [Pure, AttrSizedOperandSegments]> { + let summary = [{ +Op that copies host tensors to device with bounded dynamic shape support. + }]; + + let description = [{ +This op copies the padded tensor on cpu to TPU without the padded data. `tensors` +is a list of cpu tensors with padded data. `unpadded_sizes` is a list of shape +tensors which describes unpadded size of each dimension for each cpu tensor. +The size of the `unpadded_sizes` should be the same as `tensors`. They are both +on host. `tpu_tensors` are list of tpu device tensors without the padded data. +`tpu_tensors` also has the same size of the `tensors` and the shapes of +`tpu_tensors` are determined by the `unpadded_sizes`. + }]; + + let arguments = (ins + Variadic:$tensors, + Variadic:$unpadded_sizes + ); + + let results = (outs + Variadic:$tpu_tensors + ); + + TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<1>; + TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; +} + +def TF_TPUAnnotateTensorsWithDynamicShapeOp : TF_Op<"TPUAnnotateTensorsWithDynamicShape", [Pure]> { + let summary = [{ +Placeholder op which takes the output of TPUCopyWithDynamicShapeOp and pass +them to the following tpu ops. + }]; + + let description = [{ +This op serves as an annotation for the dynamic shaped tensor and will be +removed during the bridge rewrite. 
+ }]; + + let arguments = (ins + Variadic:$tensors + ); + + let results = (outs + Variadic:$tpu_tensors + ); + + TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; +} + +def TF_ConvertToCooTensorOp : TF_Op<"ConvertToCooTensor", [Pure]> { + let summary = [{ +Op that converts tensors into coo format. + }]; + + let description = [{ +This op coverts the dense, sparse and ragged tensor into standard coo tensor +format which contains three 1D tensors. + }]; + + let arguments = (ins + TF_Int32Tensor:$indices_or_row_splits, + TF_Int32Tensor:$values, + TF_Float32Tensor:$weights, + + ConfinedAttr]>:$sample_count, + StrAttr:$combiner + ); + + let results = (outs + TF_Int32Tensor:$row_ids, + TF_Int32Tensor:$col_ids, + TF_Float32Tensor:$gains + ); +} + +def TF_ResourceGatherNdOp : TF_Op<"ResourceGatherNd", []> { + let summary = "GatherNd on a resource."; + + let description = [{ +This op reads the variable referenced by the first argument, and +then performs a GatherNd operation on it. + }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index e69b91c198e..dfa46846aa1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -1693,7 +1693,7 @@ void ConstOp::build(OpBuilder& builder, OperationState& result, Type type, LogicalResult ConstOp::inferReturnTypes( MLIRContext* context, std::optional location, ValueRange operands, - DictionaryAttr attributes, RegionRange regions, + DictionaryAttr attributes, OpaqueProperties, RegionRange regions, SmallVectorImpl& inferredReturnTypes) { auto value = attributes.get("value"); if (!value) return emitOptionalError(location, "missing attribute 'value'"); @@ -1936,7 +1936,8 @@ static LogicalResult inferConvReturnTypeComponents( LogicalResult Conv2DOp::inferReturnTypeComponents( MLIRContext* context, std::optional location, - ValueShapeRange operands, DictionaryAttr attributes, RegionRange regions, + ValueShapeRange operands, DictionaryAttr attributes, OpaqueProperties, + RegionRange regions, SmallVectorImpl& inferredReturnShapes) { Conv2DOpAdaptor op(operands.getValues(), attributes); ArrayRef explicit_padding; @@ -2134,7 +2135,8 @@ StringRef Conv2DBackpropInputOp::GetOptimalLayout( LogicalResult Conv3DOp::inferReturnTypeComponents( MLIRContext* context, std::optional location, - ValueShapeRange operands, DictionaryAttr attributes, RegionRange regions, + ValueShapeRange operands, DictionaryAttr attributes, OpaqueProperties, + RegionRange regions, SmallVectorImpl& inferredReturnShapes) { Conv3DOpAdaptor op(operands.getValues(), attributes); ArrayRef explicit_padding; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index e4ec395b1cb..36b9d6c6e20 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -1,3 +1,4 @@ + /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); @@ -717,9 +718,9 @@ OpFoldResult RangeOp::fold(FoldAdaptor adaptor) { if (!(start_tensor && limit_tensor && delta_tensor)) return nullptr; // Operands should all be scalars - assert(start_tensor.getType().getRank() == 0 && - limit_tensor.getType().getRank() == 0 && - delta_tensor.getType().getRank() == 0); + assert(start_tensor.getShapedType().getRank() == 0 && + limit_tensor.getShapedType().getRank() == 0 && + delta_tensor.getShapedType().getRank() == 0); Type elem_type = getType().cast().getElementType(); if (elem_type.isSignlessInteger() || elem_type.isUnsignedInteger()) { auto start_attr = start_tensor.getValues()[0]; @@ -2320,6 +2321,18 @@ void TPUExecuteOp::getEffects( } } +//===----------------------------------------------------------------------===// +// WriteTrainingPredictions +//===----------------------------------------------------------------------===// + +void WriteTrainingPredictionsOp::getEffects( + SmallVectorImpl> + &effects) { + effects.reserve(1); + effects.emplace_back(MemoryEffects::Write::get(), + ResourceEffects::WriteTrainingPredictions::get()); +} + //===----------------------------------------------------------------------===// // TPUExecuteAndUpdateVariablesOp //===----------------------------------------------------------------------===// @@ -2372,7 +2385,7 @@ void TPUExecuteAndUpdateVariablesOp::getEffects( .isa(); }); - for (auto &entry : llvm::enumerate(resource_handles)) { + for (const auto &entry : llvm::enumerate(resource_handles)) { Value value = entry.value(); effects.emplace_back(MemoryEffects::Read::get(), value, ResourceEffects::Variable::get()); @@ -2660,7 +2673,7 @@ void ToBoolOp::getCanonicalizationPatterns(RewritePatternSet &results, LogicalResult ToBoolOp::inferReturnTypes( MLIRContext *context, std::optional location, ValueRange operands, - DictionaryAttr attributes, RegionRange regions, + DictionaryAttr attributes, OpaqueProperties, RegionRange regions, SmallVectorImpl &inferredReturnTypes) { inferredReturnTypes.push_back( tensorflow::GetTypeFromTFTensorShape({}, IntegerType::get(context, 1))); @@ -2734,11 +2747,13 @@ LogicalResult TransposeOp::verify() { const int64_t y_idx = e.index(); const int64_t y_dim = y_type.getDimSize(y_idx); int64_t x_idx = e.value().getSExtValue(); - if (x_idx < 0) x_idx += x_type.getRank(); - if (x_idx < 0) { + int64_t x_rank = x_type.getRank(); + if (x_idx < -x_rank || x_idx >= x_rank) { return op.emitOpError( - llvm::formatv("perm[{0}] must be in [-rank, rank)", x_idx)); + llvm::formatv("perm[{0}]={1} must be in range [-{2}, {2})", y_idx, + x_idx, x_rank)); } + if (x_idx < 0) x_idx += x_rank; const int64_t x_dim = x_type.getDimSize(x_idx); if (!ShapedType::isDynamic(y_dim) && !ShapedType::isDynamic(x_dim) && y_dim != x_dim) { @@ -3467,7 +3482,8 @@ void XdivyOp::getCanonicalizationPatterns(RewritePatternSet &results, LogicalResult XlaBroadcastHelperOp::inferReturnTypeComponents( MLIRContext *context, std::optional location, - ValueShapeRange operands, DictionaryAttr attributes, RegionRange regions, + ValueShapeRange operands, DictionaryAttr attributes, OpaqueProperties, + RegionRange regions, SmallVectorImpl &inferredReturnShapes) { XlaBroadcastHelperOpAdaptor op(operands.getValues(), attributes); Value lhs = op.getLhs(); @@ -3490,7 +3506,7 @@ LogicalResult XlaBroadcastHelperOp::inferReturnTypeComponents( return set_unranked_results(); } - if (dims.size() == 0) { + if (dims.empty()) { if (lhs_rank != rhs_rank && lhs_rank != 0 && rhs_rank != 0) { 
return emitOptionalError( location, @@ -3605,7 +3621,8 @@ LogicalResult XlaConvV2Op::verify() { LogicalResult XlaSetDynamicDimensionSizeOp::inferReturnTypeComponents( MLIRContext *context, std::optional location, - ValueShapeRange operands, DictionaryAttr attributes, RegionRange regions, + ValueShapeRange operands, DictionaryAttr attributes, OpaqueProperties, + RegionRange regions, SmallVectorImpl &inferredReturnShapes) { XlaSetDynamicDimensionSizeOpAdaptor op(operands.getValues(), attributes); @@ -3675,9 +3692,10 @@ LogicalResult XlaReduceWindowOp::verify() { auto check = [&](mlir::Value val, std::string attr_name) -> LogicalResult { ElementsAttr attr; if (matchPattern(val, m_Constant(&attr))) { - if (attr.getType().getRank() != 1) { - return op.emitOpError() << "expects the rank of " << attr_name - << "to be 1, got " << attr.getType().getRank(); + if (attr.getShapedType().getRank() != 1) { + return op.emitOpError() + << "expects the rank of " << attr_name << "to be 1, got " + << attr.getShapedType().getRank(); } if (input_ty.hasRank()) { int64_t input_rank = input_ty.getRank(); @@ -3705,11 +3723,11 @@ LogicalResult XlaReduceWindowOp::verify() { ElementsAttr padding; if (matchPattern(op.getPadding(), m_Constant(&padding))) { - const ShapedType &padding_ty = padding.getType(); + const ShapedType &padding_ty = cast(padding.getType()); if (padding_ty.getRank() != 2 || padding_ty.getDimSize(1) != 2) { return op.emitOpError() << "expects padding to be a matrix with minor dimension 2, got " - << padding.getType().getShape(); + << padding.getShapedType().getShape(); } } @@ -3762,11 +3780,11 @@ LogicalResult XlaSelectAndScatterOp::verify() { ElementsAttr padding; if (matchPattern(op.getPadding(), m_Constant(&padding))) { - const ShapedType &padding_ty = padding.getType(); + const ShapedType &padding_ty = cast(padding.getType()); if (padding_ty.getRank() != 2 || padding_ty.getDimSize(1) != 2) { return op.emitOpError() << "expects padding to be a matrix with minor dimension 2, got " - << padding.getType().getShape(); + << padding.getShapedType().getShape(); } } @@ -3922,8 +3940,8 @@ LogicalResult XlaVariadicSortOp::verify() { ElementsAttr dimension; if (matchPattern(op.getDimension(), m_Constant(&dimension))) { - if (dimension.getType().getRank() != 0 || - dimension.getType().getNumElements() != 1) + if (dimension.getShapedType().getRank() != 0 || + dimension.getShapedType().getNumElements() != 1) return op.emitOpError() << "dimension must be a scalar"; } @@ -4108,6 +4126,26 @@ LogicalResult UniformQuantizedConvolutionOp::verify() { return VerifyLhsRhsBothUniformQuantizedOp(*this); } +//===----------------------------------------------------------------------===// +// UniformQuantizedAddOp +//===----------------------------------------------------------------------===// +// + +LogicalResult UniformQuantizedAddOp::verify() { + return VerifyLhsRhsBothUniformQuantizedOp(*this); +} + +//===----------------------------------------------------------------------===// +// UniformQuantizedClipByValueOp +//===----------------------------------------------------------------------===// +// + +LogicalResult UniformQuantizedClipByValueOp::verify() { + UniformQuantizedClipByValueOp op = *this; + return VerifyScalesAndZeroPoints(op, op.getScales(), op.getZeroPoints(), + op.getQuantizationAxis()); +} + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc index d872f8ecd04..247a85804d8 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc @@ -95,7 +95,7 @@ LogicalResult _XlaHostComputeMlirOp::verify() { if (!status.ok()) { return op.emitError() << "attribute 'host_mlir_module' can not be deserialized. " - << status.error_message(); + << status.message(); } func::FuncOp func = module_for_func->lookupSymbol("host_func"); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h index e1c02b8d9c9..2fe672f1477 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h @@ -108,6 +108,11 @@ struct XlaLaunch : public ::mlir::SideEffects::Resource::Base { StringRef getName() final { return "XlaLaunch"; } }; +struct WriteTrainingPredictions + : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "WriteTrainingPredictions"; } +}; + // Returns true iff resource type with given ID is only self-dependent, i.e., // there are no dependencies to other resource types (including unknown resource // type). diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h index 87def73cd5b..62f6192c1f8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h @@ -151,7 +151,8 @@ class SameOperandsAndResultTypeResolveRef static LogicalResult inferReturnTypeComponentsFromOperands( MLIRContext*, std::optional location, ValueShapeRange operands, - DictionaryAttr attributes, RegionRange regions, + DictionaryAttr attributes, OpaqueProperties properties, + RegionRange regions, SmallVectorImpl& inferredReturnShapes) { if (operands.empty()) return emitOptionalError( diff --git a/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir index 67a3e54979c..e1071f3e899 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir @@ -1,5 +1,43 @@ // RUN: tf-opt -split-input-file -verify-diagnostics -tf-einsum %s | FileCheck %s +func.func @unary_einsum_reduce_sum_transpose(%arg0: tensor<3x4x5x6xf32>) -> tensor<3x5x4xf32> { + %0 = "tf.Einsum"(%arg0) {T = "tfdtype$DT_FLOAT", equation = "...gse->...sg"}: (tensor<3x4x5x6xf32>) -> tensor<3x5x4xf32> + func.return %0 : tensor<3x5x4xf32> + // CHECK-LABEL: unary_einsum_reduce_sum_transpose + // CHECK-DAG: %[[cst:.*]] = arith.constant dense<3> : tensor<1xi32> + // CHECK-DAG: %[[cst_1:.*]] = arith.constant dense<[0, 2, 1]> : tensor<3xi32> + // CHECK: %[[v0:.*]] = "tf.Sum"(%arg0, %[[cst]]) {keep_dims = false} : (tensor<3x4x5x6xf32>, tensor<1xi32>) -> tensor<3x4x5xf32> + // CHECK: %[[v1:.*]] = "tf.Transpose"(%[[v0]], %[[cst_1]]) : (tensor<3x4x5xf32>, tensor<3xi32>) -> tensor<3x5x4xf32> + // CHECK: return %[[v1]] : tensor<3x5x4xf32> +} + +func.func @unary_einsum_reduce_sum_transpose1(%arg0: tensor<3x4x5x6xf32>) -> tensor<3x4x5xf32> { + %0 = "tf.Einsum"(%arg0) {T = "tfdtype$DT_FLOAT", equation = "...gse->...gs"}: (tensor<3x4x5x6xf32>) -> tensor<3x4x5xf32> + func.return %0 : tensor<3x4x5xf32> + // CHECK-LABEL: unary_einsum_reduce_sum_transpose1 + // CHECK-DAG: %[[cst:.*]] = arith.constant dense<3> : tensor<1xi32> + // CHECK: %[[v0:.*]] = "tf.Sum"(%arg0, %[[cst]]) {keep_dims = false} : (tensor<3x4x5x6xf32>, tensor<1xi32>) -> tensor<3x4x5xf32> + // CHECK: return %[[v0]] : tensor<3x4x5xf32> +} + 
+func.func @unary_einsum_transpose(%arg0: tensor<3x4x5xf32>) -> tensor<3x5x4xf32> { + %0 = "tf.Einsum"(%arg0) {T = "tfdtype$DT_FLOAT", equation = "ijk->ikj"}: (tensor<3x4x5xf32>) -> tensor<3x5x4xf32> + func.return %0 : tensor<3x5x4xf32> + // CHECK-LABEL: unary_einsum_transpose + // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[0, 2, 1]> : tensor<3xi32> + // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst]]) : (tensor<3x4x5xf32>, tensor<3xi32>) -> tensor<3x5x4xf32> + // CHECK: return %[[v0]] : tensor<3x5x4xf32> +} + +func.func @unary_einsum_reduce_sum(%arg0: tensor<4x5x6xf32>) -> tensor<4xf32> { + %0 = "tf.Einsum"(%arg0) {T = "tfdtype$DT_FLOAT", equation = "ijk->i"}: (tensor<4x5x6xf32>) -> tensor<4xf32> + func.return %0 : tensor<4xf32> + // CHECK-LABEL: unary_einsum_reduce_sum + // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[1, 2]> : tensor<2xi32> + // CHECK: %[[v0:.*]] = "tf.Sum"(%arg0, %[[cst]]) {keep_dims = false} : (tensor<4x5x6xf32>, tensor<2xi32>) -> tensor<4xf32> + // CHECK: return %[[v0]] +} + func.func @einsum_basic(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> tensor<3x4x6xf32> { %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,ikm->ijm"}: (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> func.return %0 : tensor<3x4x6xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir new file mode 100644 index 00000000000..f6bd3d4d586 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir @@ -0,0 +1,319 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-embedding-pipelining | FILECHECK_OPTS="" FileCheck %s + +// This test verifies the handling of TPU replicated inputs and outputs as well as the extraction of the four main functions. +module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + // Verify that everything is extracted into one of the four functions. + // The order of these functions is also significant. 
+ // CHECK: {{.*StatefulPartitionedCall.* f = @_func_non_tpu.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_forward.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_core_tpu.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_backward.*}} + // CHECK-NEXT: return + // metadata ops + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } + // Generated functions + // non_tpu should have no TPU ops - just identity and return (in this test). + // CHECK: func.func private @_func_non_tpu + // CHECK-NEXT: tf.Identity + // CHECK-NEXT: return + + // sc_forward should have TPU ops including replicated outputs but not inputs + // CHECK: func.func private @_func_sc_forward + // CHECK-NOT: TPUReplicatedInput + // CHECK-DAG: TPUReplicateMetadata + // CHECK-DAG: TPUCompilationResult + // CHECK-DAG: TPUReplicatedOutput + // CHECK: return + + // core_tpu should have TPU ops including both replicated inputs and outputs + // CHECK: func.func private @_func_core_tpu + // CHECK-DAG: TPUReplicatedInput + // CHECK-DAG: TPUReplicateMetadata + // CHECK-DAG: TPUCompilationResult + // CHECK-DAG: TPUReplicatedOutput + // CHECK: return + + // sc_backward should have TPU ops including replicated inputs but not outputs + // CHECK: func.func private @_func_sc_backward + // CHECK-NOT: TPUReplicatedOutput + // CHECK-DAG: TPUReplicateMetadata + // CHECK-DAG: TPUCompilationResult + // CHECK-DAG: TPUReplicatedInput + // CHECK: return +} + +// ----- +// This test verifies that the extraction works correctly for evaluation-only models.
+module { + func.func @main() { + %cst = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %0 = "tf.While"(%cst) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + // CHECK: {{.*StatefulPartitionedCall.* f = @_func_non_tpu.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_forward.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_core_tpu.*}} + // metadata ops + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () + %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Identity"(%arg0) {_embedding_pipelining = "forward", _replication_info = "repl_info"} : (tensor) -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } + // Only verify sc_backward. The previous test case verifies everything else. + // CHECK: func.func private @_func_sc_backward + // CHECK-NEXT: return +} + +// ----- +// A test verifying too many TPUReplicateMetadataOp ops. Same logic tests too many TPUCompilationResultOp ops. +module { + func.func @main(%arg0: tensor<*x!tf_type.resource>, %arg1: tensor<*x!tf_type.resource>, %arg2: tensor<*x!tf_type.resource>>) { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + // expected-error @+1 {{number of tf.TPUReplicateMetadata in loop body is not 1}} + func.func private @while_body(%arg0: tensor) -> (tensor) { + // metadata ops + %embedding_pass_trigger = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () + %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + return %arg0 : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- +// A test verifying the replication region of TPUReplicateMetadataOp ops. Same logic tests too many TPUCompilationResultOp ops. 
+module { + func.func @main(%arg0: tensor<*x!tf_type.resource>, %arg1: tensor<*x!tf_type.resource>, %arg2: tensor<*x!tf_type.resource>>) { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + // metadata ops + %embedding_pass_trigger = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () + // expected-error @+1 {{'tf.TPUCompilationResult' op is not part of the replication region "repl_info" vs "wrong_repl_info"}} + %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "wrong_repl_info"} : () -> tensor + return %arg0 : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- +// A test verifying TPUReplicatedOutput in the input graph doesn't trigger +// any additional TPUReplicatedInput or TPUReplicatedOutput ops. +module { + func.func @main() { + %cst_1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %0:2 = "tf.While"(%cst_1, %cst_2) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + return + } + func.func private @while_body(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + // CHECK: {{.*StatefulPartitionedCall.* f = @_func_non_tpu.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_forward.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_core_tpu.*}} + // metadata ops + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + %2 = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<3> : tensor} : () -> tensor + %3:2 = "tf.TPUReplicatedOutput"(%2) {device = ""} : (tensor) -> (tensor, tensor) + + // core_tpu ops: + %res_t = "tf.Const"() {_replication_info = "repl_info", value = dense<4> : tensor} : () -> tensor + + // non_tpu_ops + %res_n = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + + return %res_n, %3#1 : tensor, tensor + } + func.func private @while_cond(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf.Less"(%arg1, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } + // CHECK-DAG: TPUReplicatedOutput + // CHECK-NOT: TPUReplicatedoutput + // CHECK-NOT: TPUReplicatedInput +} + +// ----- +// Verify error for backward pass with no forward pass. 
+module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + // expected-error @+1 {{'tf.Identity' op embedding backwards pass op with no forwards pass ops}} + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// Verify error for unknown _embedding_pipelining attribute value. +module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + // expected-error @+1 {{'tf.Identity' op embedding op has unknown _embedding_pipelining attribute value garbage.}} + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "garbage", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// Verify error for multiple WhileOp use of while_body function. 
+module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + // expected-error @+1 {{'tf.While' op multiple users of function.}} + %1 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// Verify error for non-WhileOp use of while_body function. +module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + // expected-error @+1 {{'tf.StatefulPartitionedCall' op non while use of function.}} + %38 = "tf.StatefulPartitionedCall"(%cst_main) {config = "", config_proto = "", executor_type = "", f = @while_body} : (tensor) -> tensor + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/extract_head_tail_outside_compilation.mlir index ffcd2a25923..baf243c9b5f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/extract_head_tail_outside_compilation.mlir @@ -152,7 +152,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) // CHECK-NOT: _xla_outside_compilation // CHECK-NEXT: tf_device.return %[[A_OUT]] - // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + // CHECK-NEXT: 
device = "TPU_REPLICATED_HOST_0" // // CHECK: "tf_device.cluster" // CHECK-NEXT: "tf.B"(%[[LAUNCH_OUT]]) @@ -370,7 +370,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[CLUSTER_OUT]], %[[RI]]) // CHECK-NOT: _xla_outside_compilation // CHECK-NEXT: tf_device.return - // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + // CHECK-NEXT: device = "TPU_REPLICATED_HOST_0" tf_device.replicate([%arg0, %arg1] as %ri : tensor) {n = 2 : i32} { "tf_device.cluster"() ({ %a = "tf.A"(%ri) : (tensor) -> tensor @@ -439,7 +439,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) // CHECK-NOT: _xla_outside_compilation // CHECK-NEXT: tf_device.return %[[A_OUT]] - // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + // CHECK-NEXT: device = "TPU_REPLICATED_HOST_0" // // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" @@ -456,7 +456,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf.D"(%[[HEAD_LAUNCH_OUT]], %[[CLUSTER_OUT]], %[[RI]]) // CHECK-NOT: _xla_outside_compilation // CHECK-NEXT: tf_device.return - // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + // CHECK-NEXT: device = "TPU_REPLICATED_HOST_0" tf_device.replicate([%arg0, %arg1] as %ri : tensor) {n = 2 : i32} { "tf_device.cluster"() ({ %a = "tf.A"(%ri) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir index 0f2941bf317..f9a097d8fef 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir @@ -104,7 +104,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf_device.launch" // CHECK: "tf.B" // CHECK-NEXT: tf_device.return - // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + // CHECK-NEXT: device = "TPU_REPLICATED_HOST_0" // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster" // CHECK: tf_device.return // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]] @@ -215,6 +215,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"() // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: device_type = "TPU" // CHECK-SAME: key = "host_compute_channel_0_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" @@ -227,7 +228,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %2 = "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> (tensor<2xi32>) %3 = "tf.C"(%2) : (tensor<2xi32>) -> tensor<2xi32> tf_device.return %3 : tensor<2xi32> - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor<2xi32> + }) {_xla_compile_device_type = "TPU", num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor<2xi32> func.return %0 : tensor<2xi32> } @@ -2067,3 +2068,33 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc func.return %0 : tensor<2xi32> } } + +// ----- +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0"]} { + // CHECK-LABEL: func 
@single_outside_compiled_output_device_type + func.func @single_outside_compiled_output_device_type(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf.Const"() {value = dense<""> : tensor<3x!tf_type.string>} : () -> tensor<3x!tf_type.string> + // CHECK-NOT: "tf._TPUDeviceOrdinalPlaceholder" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"() + // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: device_type = "CPU" + // CHECK-SAME: key = "host_compute_channel_0_retvals" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir"() + // CHECK-SAME: recv_key = "host_compute_channel_0_retvals" + // CHECK-SAME: send_key = "host_compute_channel_0_args" + // CHECK: "tf.C"(%[[HOST_OUTPUT]]) + %0 = "tf_device.cluster"() ({ + %1 = "tf.A"() : () -> (tensor<2xi32>) + %2 = "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> (tensor<2xi32>) + %3 = "tf.C"(%2) : (tensor<2xi32>) -> tensor<2xi32> + tf_device.return %3 : tensor<2xi32> + }) {_xla_compile_device_type = "CPU"} : () -> tensor<2xi32> + + func.return %0 : tensor<2xi32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir new file mode 100644 index 00000000000..fed754ea3c1 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir @@ -0,0 +1,43 @@ +// RUN: tf-opt -split-input-file -verify-diagnostics -tf-extract-tpu-copy-with-dynamic-shape-op %s | FileCheck %s + +// Test that extract TPUCopyWithDynamicShape from host launch to device launch + +// CHECK-LABEL: func @valid_copy_op_in_replicated_host + +// CHECK: "tf_device.launch" +// CHECK: "TPU_REPLICATED_HOST_0" +// CHECK: "tf_device.launch" +// CHECK: "tf.TPUCopyWithDynamicShape" +// CHECK: "TPU_REPLICATED_CORE_0" +func.func @valid_copy_op_in_replicated_host( + %arg0: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, + %arg1: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) -> (tensor<2048xi32>, tensor<2048xi32>) { + %cst = "tf.Const"() {value = dense<1024> : tensor} : () -> tensor + %0:2 = "tf_device.launch"() ({ + %1 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<2048xi64>) -> tensor<2048xi32> + %2 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2048xi64>) -> tensor<2048xi32> + %3:2 = "tf.TPUCopyWithDynamicShape"(%1, %2, %cst, %cst) {operand_segment_sizes = array} : (tensor<2048xi32>, tensor<2048xi32>, tensor, tensor) -> (tensor<2048xi32>, tensor<2048xi32>) + tf_device.return %3#0, %3#1 : tensor<2048xi32>, tensor<2048xi32> + }) {device = "TPU_REPLICATED_HOST_0"} : () -> (tensor<2048xi32>, tensor<2048xi32>) + return %0#0, %0#1: tensor<2048xi32>, tensor<2048xi32> +} + +// CHECK-LABEL: func @valid_copy_op_in_non_replicated_host + +// CHECK: "tf_device.launch" +// CHECK: "/job:localhost/replica:0/task:0/device:CPU:0" +// CHECK: "tf_device.launch" +// CHECK: "tf.TPUCopyWithDynamicShape" +// CHECK: "/job:localhost/replica:0/task:0/device:TPU:0" +func.func @valid_copy_op_in_non_replicated_host( + %arg0: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, + %arg1: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) 
-> (tensor<2048xi32>, tensor<2048xi32>) { + %cst = "tf.Const"() {value = dense<1024> : tensor} : () -> tensor + %0:2 = "tf_device.launch"() ({ + %1 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<2048xi64>) -> tensor<2048xi32> + %2 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2048xi64>) -> tensor<2048xi32> + %3:2 = "tf.TPUCopyWithDynamicShape"(%1, %2, %cst, %cst) {operand_segment_sizes = array} : (tensor<2048xi32>, tensor<2048xi32>, tensor, tensor) -> (tensor<2048xi32>, tensor<2048xi32>) + tf_device.return %3#0, %3#1 : tensor<2048xi32>, tensor<2048xi32> + }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<2048xi32>, tensor<2048xi32>) + return %0#0, %0#1: tensor<2048xi32>, tensor<2048xi32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt index b7e47126779..d4d5b8e3c52 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt @@ -1,10 +1,11 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -tf-xla-compile-device-type="GPU" -o - | FileCheck %s # Verify main graph was converted to a function, args/rets are mapped correctly, # and ops in the main graph are retained. In addition, check if subsequent # functions are converted. # CHECK: func @main(%arg0: tensor<*x!tf_type.resource>, %arg1: tensor<*x!tf_type.resource>>, %arg2: tensor<*xf32>, %arg3: tensor<2x4x6x8xi32>) -> (tensor<*xf32>, tensor<*xf32>) +# CHECK-SAME: _xla_compile_device_type = "GPU" # CHECK-SAME: control_outputs = "" # CHECK-SAME: inputs = "args_0,args_1,args_2,args_3" # CHECK-SAME: outputs = "rets_0,rets_1" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example.mlir index e6c041168d5..446af8cfb3f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example.mlir @@ -2,10 +2,10 @@ // CHECK: name: "tf.ParseExample" // CHECK-NEXT: op: "ParseExample" -// CHECK-NEXT: input: "tf.Const3" +// CHECK-NEXT: input: "tf.Const{{_.*_3}}" // CHECK-NEXT: input: "tf.Const" -// CHECK-NEXT: input: "tf.Const1" -// CHECK-NEXT: input: "tf.Const2" +// CHECK-NEXT: input: "tf.Const{{_.*_1}}" +// CHECK-NEXT: input: "tf.Const{{_.*_2}}" // CHECK-NEXT: attr { // CHECK-NEXT: key: "Ndense" // CHECK-NEXT: value { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example_v2.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example_v2.mlir index a79a6c772d6..bf69559780c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example_v2.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/parse_example_v2.mlir @@ -15,12 +15,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK: name: "ParseExample" // CHECK-NEXT: op: "ParseExampleV2" // CHECK-NEXT: input: "input0" - // CHECK-NEXT: input: "tf.Const3" - // CHECK-NEXT: input: "tf.Const5" - // CHECK-NEXT: input: "tf.Const2" - // CHECK-NEXT: input: "tf.Const4" + // CHECK-NEXT: input: "tf.Const{{_.*_3}}" + 
// CHECK-NEXT: input: "tf.Const{{_.*_5}}" + // CHECK-NEXT: input: "tf.Const{{_.*_2}}" + // CHECK-NEXT: input: "tf.Const{{_.*_4}}" // CHECK-NEXT: input: "tf.Const" - // CHECK-NEXT: input: "tf.Const1" + // CHECK-NEXT: input: "tf.Const{{_.*_1}}" // CHECK-NEXT: attr { // CHECK-NEXT: key: "Tdense" // CHECK-NEXT: value { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/preserve-entry-func-names.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/preserve-entry-func-names.mlir index 9570d2cdb94..fdbfc839e55 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/preserve-entry-func-names.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/preserve-entry-func-names.mlir @@ -25,4 +25,4 @@ attributes {tf.entry_function = {inputs = "foo,bar", outputs = "Add"}} { // CHECK-NEXT: input: "[[BAR_ID_0]]" // CHECK: name: "Add" // CHECK-NEXT: op: "_Retval" -// CHECK-NEXT: input: "Add1" +// CHECK-NEXT: input: "Add{{_.*_1}}" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf_add.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf_add.mlir index 0cc07f8816c..d608c8550c6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf_add.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf_add.mlir @@ -16,14 +16,14 @@ attributes {tf.entry_function = {inputs = "input0,input1", outputs = "Add"}} { // CHECK-NEXT: name: "input1" // CHECK-NEXT: op: "_Arg" // CHECK: node { -// CHECK-NEXT: name: "Add1" +// CHECK-NEXT: name: "Add{{_.*_1}}" // CHECK-NEXT: op: "Add" // CHECK-NEXT: input: "input0" // CHECK-NEXT: input: "input1" // CHECK: node { // CHECK-NEXT: name: "Add" // CHECK-NEXT: op: "_Retval" -// CHECK-NEXT: input: "Add1" +// CHECK-NEXT: input: "Add{{_.*_1}}" // CHECK-NEXT: attr { // CHECK-NEXT: key: "T" // CHECK-NEXT: value { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/unique_name.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/unique_name.mlir index 72b445341ea..dc569a9e94f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/unique_name.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/unique_name.mlir @@ -4,11 +4,11 @@ func.func @main() { tf_executor.graph { // CHECK: name: "foo" %0:2 = tf_executor.island wraps "tf.Const"() {dtype = "tfdtype$DT_INT32", value = dense<0> : tensor} : () -> (tensor) loc("foo") - // CHECK: name: "foo1" + // CHECK: name: "foo{{_.*_1}}" %1:2 = tf_executor.island wraps "tf.Const"() {dtype = "tfdtype$DT_INT32", value = dense<1> : tensor} : () -> (tensor) loc("foo") - // CHECK: name: "foo11" + // CHECK: name: "foo1" %2:2 = tf_executor.island wraps "tf.Const"() {dtype = "tfdtype$DT_INT32", value = dense<2> : tensor} : () -> (tensor) loc("foo1") - // CHECK: name: "foo2" + // CHECK: name: "foo{{_.*_2}}" %3:2 = tf_executor.island wraps "tf.Const"() {dtype = "tfdtype$DT_INT32", value = dense<2> : tensor} : () -> (tensor) loc("foo") // CHECK: name: "2" %4:2 = tf_executor.island wraps "tf.Const"() {dtype = "tfdtype$DT_INT32", value = dense<3> : tensor} : () -> (tensor) loc("2") diff --git a/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir b/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir index e2d94c9c6e7..2f744534abd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir @@ -48,7 +48,7 @@ module attributes {tf.versions = 
{producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf.C" // CHECK-NOT: _xla_outside_compilation // CHECK: tf_device.return - // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + // CHECK-NEXT: device = "TPU_REPLICATED_HOST_0" // CHECK: device_assignment = [], num_cores_per_replica = 1 : i64, topology = "" %0 = "tf.A"(%arg0) : (tensor) -> tensor tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir b/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir index 66b0395f00a..45ee57ad75d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir @@ -163,3 +163,13 @@ func.func @UnsupportedOp(%arg0: tensor) -> tensor { func.return %0 : tensor } +// ----- + +// _XlaHostComputeMlir with manual_sharding should not fall back to +// XlaHostCompute, because XlaHostCompute does not support manual_sharding. + +func.func @HostComputeManualNoFallback(%arg0: tensor) -> () { + // expected-error @+1 {{manual_sharding not supported with fallback}} + %1 = "tf._XlaHostComputeMlir"(%arg0) {recv_key = "host_compute_channel_recv1", send_key = "host_compute_channel_send1", host_mlir_module = "", manual_sharding = true} : (tensor) -> (tensor) + func.return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_invariant_op_hoisting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_invariant_op_hoisting.mlir index 2892a011923..b34a26431c0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_invariant_op_hoisting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_invariant_op_hoisting.mlir @@ -32,6 +32,18 @@ func.func @invariant_shape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) { // CHECK: tf_device.return %[[SHAPE]] +// CHECK-LABEL: func @not_invariant_ordinal_placeholder +func.func @not_invariant_ordinal_placeholder(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) { + // CHECK: tf_device.replicate + // CHECK: tf._TPUDeviceOrdinalPlaceholder + %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<*xf32>) {n = 2: i32} { + %1 = "tf._TPUDeviceOrdinalPlaceholder"() : () -> tensor + tf_device.return %1 : tensor + } + func.return +} + + // CHECK-LABEL: func @replicate_resource_var_arg_shape // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<*x!tf_type.resource>, %[[ARG_1:[a-z0-9]*]]: tensor<*x!tf_type.resource>) func.func @replicate_resource_var_arg_shape(%arg0: tensor<*x!tf_type.resource>, %arg1: tensor<*x!tf_type.resource>) { @@ -190,3 +202,21 @@ func.func @do_not_hoist_ops_with_virtual_device(%arg0: tensor<*xf32>, %arg1: ten // CHECK: tf_device.return [[OP_C]] : tensor<*xi32> // CHECK: }) {device = "c"} : () -> tensor<*xi32> // CHECK: tf_device.return [[SHAPE]], [[OP_A]], [[LAUNCH_B]], [[LAUNCH_C]] + + +// Checks that the argument to a Shape that has a virtual device is not changed. 
+ +// CHECK-LABEL: func @do_not_mutate_shape_op_with_virtual_device +// CHECK: tf_device.replicate +// CHECK-SAME: as [[RI:%.*]]: tensor<*xf32> +// CHECK: "tf.Shape"([[RI]]) +func.func @do_not_mutate_shape_op_with_virtual_device(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) { + tf_device.replicate([%arg0, %arg1] as %ri: tensor<*xf32>) {devices = {TPU_REPLICATED_HOST_0 = ["/device:CPU:0", "/device:CPU:1"]}, n = 2: i32} { + "tf_device.launch"() ({ + %1 = "tf.Shape"(%ri) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> tensor + tf_device.return + }) {device = "TPU_REPLICATED_HOST_0"} : () -> () + tf_device.return + } + func.return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 3a1e3316c26..71fbf7cca9e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -1282,6 +1282,21 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.return %1#1 : tensor<*x!quant.uniform> } + // CHECK-LABEL: func @xla_call_module + // CHECK-SAME: (%arg0: tensor) -> tensor + func.func @xla_call_module(%arg0: tensor) -> tensor<*xf32> { + // Equivalent to the following: + // + // module @jit_sin { + // func.func public @main(%arg0: tensor) -> tensor { + // %0 = stablehlo.sine %arg0 : tensor + // return %0 : tensor + // } + // } + %0 = "tf.XlaCallModule"(%arg0) {Sout = [#tf_type.shape<*>], device = "", dim_args_spec = [], module = "ML\EFR\03MLIRxxx-trunk\00\01\17\05\01\05\01\03\05\03\07\07\t\0B\03K5\07\01\1B\07\0B\13\0B3\0B\0B\0B\0B\0F\0B\13\0B\03\1B\0F\1B\0B\0B\0B\0B\0B\0F\13\0B\0B\0B\0B\03\07\0F\17\07\02\A7\1F\05\0D\03\03\03\07\05\0F\03\0B\0B\1B\0D'\0F)\031\113\05\11\05\13\05\15\05\17\1D\15\17\05\19\17\19\EF\01\05\1B\03\03\1D\0D\05\1F!#%\1D\1D\1D\1F\1D!\1D##\03\03\03+\0D\03-/\1D%\1D'\1D)\1D+)\01\05\11\03\01\03\01\t\04A\05\01\11\01\05\07\03\01\05\03\11\01\t\05\03\05\0B\03\01\01\05\06\13\03\01\03\01\07\04\01\03\03\06\03\01\05\01\00\9A\04-\0F\0B\03!\1B\1D\05\1B\83/\1F\15\1D\15\11\13\15\11\11\0F\0B\11builtin\00vhlo\00module\00func_v1\00sine_v1\00return_v1\00sym_name\00jit_sin\00arg_attrs\00function_type\00res_attrs\00sym_visibility\00jit(sin)/jit(main)/sin\00third_party/py/jax/experimental/jax2tf/tests/back_compat_test.py\00jax.arg_info\00x\00mhlo.sharding\00{replicated}\00jax.result_info\00\00main\00public\00", platforms = [], version = 4 : i64} : (tensor) -> tensor<*xf32> + func.return %0 : tensor<*xf32> + } + // CHECK-LABEL: func @xla_host_compute_mlir_empty_module func.func @xla_host_compute_mlir_empty_module(%arg0: tensor<2xf32>) -> tensor<*xf32> { // CHECK: "tf._XlaHostComputeMlir" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 17259f35fc2..74363ecc967 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -701,6 +701,7 @@ func.func @testConv2D(%arg0: tensor<256x32x3xf32>, %arg1: tensor<3x3x3x16xf32>) // ----- func.func @testConv3D(%arg0: tensor<256x32x32x32x3xf32>, %arg1: tensor<3x3x3x3x16xf32>) -> tensor<256x32x32x16xf32> { + // expected-error @+2 {{'tf.Conv3D' op failed to infer returned types}} // expected-error @+1 {{op inferred type(s) 'tensor<256x32x32x32x16xf32>' are incompatible with return type(s) of operation 'tensor<256x32x32x16xf32>'}} %0 = "tf.Conv3D"(%arg0, %arg1) {padding = "SAME", strides = [1, 
1, 1, 1, 1]} : (tensor<256x32x32x32x3xf32>, tensor<3x3x3x3x16xf32>) -> tensor<256x32x32x16xf32> func.return %0 : tensor<256x32x32x16xf32> @@ -757,6 +758,7 @@ func.func @testConv2D(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32 // ----- func.func @testConv2D(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> { + // expected-error @+2 {{'tf.Conv2D' op failed to infer returned types}} // expected-error @+1 {{op inferred type(s) 'tensor<256x16x11x16xf32>' are incompatible with return type(s) of operation 'tensor<256x30x30x16xf32>'}} %0 = "tf.Conv2D"(%arg0, %arg1) {padding = "SAME", strides = [1, 2, 3, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x30x30x16xf32> func.return %0 : tensor<256x30x30x16xf32> @@ -765,6 +767,7 @@ func.func @testConv2D(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32 // ----- func.func @testConv2D(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32>) -> tensor<256x16x30x16xf32> { + // expected-error @+2 {{'tf.Conv2D' op failed to infer returned types}} // expected-error @+1 {{op inferred type(s) 'tensor<256x16x11x16xf32>' are incompatible with return type(s) of operation 'tensor<256x16x30x16xf32>'}} %0 = "tf.Conv2D"(%arg0, %arg1) {padding = "SAME", strides = [1, 2, 3, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x16x30x16xf32> func.return %0 : tensor<256x16x30x16xf32> @@ -773,6 +776,7 @@ func.func @testConv2D(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32 // ----- func.func @testConv2D(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32>) -> tensor<256x32x32x16xf32> { + // expected-error @+2 {{'tf.Conv2D' op failed to infer returned types}} // expected-error @+1 {{op inferred type(s) 'tensor<256x6x6x16xf32>' are incompatible with return type(s) of operation 'tensor<256x32x32x16xf32>'}} %0 = "tf.Conv2D"(%arg0, %arg1) {padding = "EXPLICIT", dilations = [1, 2, 3, 4], explicit_paddings = [1, 2, 3, 4, 5, 6, 7, 8], strides = [5, 6, 7, 8]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x32x32x16xf32> func.return %0 : tensor<256x32x32x16xf32> @@ -781,6 +785,7 @@ func.func @testConv2D(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32 // ----- func.func @testConv2D(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<3x3x3x16xf32>) -> tensor<256x32x32x16xf32> { + // expected-error @+2 {{'tf.Conv2D' op failed to infer returned types}} // expected-error @+1 {{op inferred type(s) 'tensor<256x30x30x16xf32>' are incompatible with return type(s) of operation 'tensor<256x32x32x16xf32>'}} %0 = "tf.Conv2D"(%arg0, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x32x32x16xf32> func.return %0 : tensor<256x32x32x16xf32> @@ -2554,6 +2559,7 @@ func.func @testConst() -> tensor { // Test invalid tf.ToBool func.func @testInvalidToBool(%arg0: tensor) -> tensor<1xi1> { + // expected-error @+2 {{'tf.ToBool' op failed to infer returned types}} // expected-error @+1 {{op inferred type(s) 'tensor' are incompatible with return type(s) of operation 'tensor<1xi1>'}} %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor<1xi1> func.return %0 : tensor<1xi1> @@ -2639,7 +2645,7 @@ func.func @testTranspose(tensor<2x3xf32>) -> tensor<3x2xf32> { func.func @testTranspose(tensor<2x2xf32>) -> tensor<2x2xf32> { ^bb0(%arg0: tensor<2x2xf32>): %cst = arith.constant dense<[1, -3]> : tensor<2xi32> - // expected-error @+1 {{perm[-1] must be in [-rank, rank)}} + // expected-error @+1 {{'tf.Transpose' op perm[1]=-3 must be 
in range [-2, 2)}} %0 = "tf.Transpose"(%arg0, %cst) {T = "tfdtype$DT_FLOAT", Tperm = "tfdtype$DT_INT32"} : (tensor<2x2xf32>, tensor<2xi32>) -> tensor<2x2xf32> func.return %0 : tensor<2x2xf32> } @@ -4341,6 +4347,7 @@ func.func @testVarHandleOp() -> tensor<*x!tf_type.resource> { func.func @testXlaBroadcastHelper(%arg0: tensor<2x3x5xi32>, %arg1: tensor<5x2xi32>) -> () { %0 = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> + // expected-error @+2 {{'tf.XlaBroadcastHelper' op failed to infer returned types}} // expected-error @+1 {{broadcast_dims must have size equal to the smaller argument rank}} %lhs_output, %rhs_output = "tf.XlaBroadcastHelper"(%arg0, %arg1, %0) : (tensor<2x3x5xi32>, tensor<5x2xi32>, tensor<1xi64>) -> (tensor<2x3x5xi32>, tensor<2x1x5xi32>) func.return @@ -4350,6 +4357,7 @@ func.func @testXlaBroadcastHelper(%arg0: tensor<2x3x5xi32>, %arg1: tensor<5x2xi3 func.func @testXlaBroadcastHelper(%arg0: tensor<2x3x5xi32>, %arg1: tensor<5x2xi32>) -> () { %0 = "tf.Const"() {value = dense<> : tensor<0xi64>} : () -> tensor<0xi64> + // expected-error @+2 {{'tf.XlaBroadcastHelper' op failed to infer returned types}} // expected-error @+1 {{if broadcast_dims is empty, both arguments must have equal rank or at least one argument must be a scalar}} %lhs_output, %rhs_output = "tf.XlaBroadcastHelper"(%arg0, %arg1, %0) : (tensor<2x3x5xi32>, tensor<5x2xi32>, tensor<0xi64>) -> (tensor<2x3x5xi32>, tensor<2x1x5xi32>) func.return @@ -4359,6 +4367,7 @@ func.func @testXlaBroadcastHelper(%arg0: tensor<2x3x5xi32>, %arg1: tensor<5x2xi3 func.func @testXlaBroadcastHelper(%arg0: tensor<5x2xi32>, %arg1: tensor<2x3x5xi32>) -> () { %0 = "tf.Const"() {value = dense<0> : tensor<2xi64>} : () -> tensor<2xi64> + // expected-error @+2 {{'tf.XlaBroadcastHelper' op failed to infer returned types}} // expected-error @+1 {{broadcast_dims has duplicates}} %lhs_output, %rhs_output = "tf.XlaBroadcastHelper"(%arg0, %arg1, %0) : (tensor<5x2xi32>, tensor<2x3x5xi32>, tensor<2xi64>) -> (tensor<2x1x5xi32>, tensor<2x3x5xi32>) func.return @@ -4674,6 +4683,7 @@ func.func @testReluStaticShapeInputAndDynamicShapeOutput(%arg0: tensor<8x16xf32> func.func @set_dynamic_dimension_size(%input: tensor<4xf32>, %size: tensor) -> tensor { %dimension = "tf.Const"() { value = dense<1> : tensor } : () -> tensor + // expected-error @+2 {{'tf.XlaSetDynamicDimensionSize' op failed to infer returned types}} // expected-error @+1 {{dim_index (1) is out of range [0, 1)}} %0 = "tf.XlaSetDynamicDimensionSize"(%input, %dimension, %size) : (tensor<4xf32>, tensor, tensor) -> tensor func.return %0 : tensor @@ -5023,6 +5033,52 @@ func.func @testUniformQuantizedConvolution( func.return } +// ----- + +func.func @testUniformQuantizedAdd( + %input: tensor<2x2x!tf_type.qint32>, %bias: tensor<2x!tf_type.qint32>, + %input_scales: tensor, %input_zps: tensor, + %bias_scales: tensor, %bias_zps: tensor, + %output_scales: tensor<2xf32>, %output_zps: tensor) -> () { + // expected-error @below {{'tf.UniformQuantizedAdd' op quantization_axis is -1, scales must have 0 rank.}} + %1 = "tf.UniformQuantizedAdd"( + %input, %bias, + %input_scales, %input_zps, + %bias_scales, %bias_zps, + %output_scales, %output_zps) { + lhs_quantization_axis = -1 : i64, + lhs_quantization_min_val = -2147483648 : i64, + lhs_quantization_max_val = 2147483647 : i64, + rhs_quantization_axis = -1 : i64, + rhs_quantization_min_val = -2147483648 : i64, + rhs_quantization_max_val = 2147483647 : i64, + output_quantization_axis = -1 : i64, + output_quantization_min_val = -2147483648 : 
i64, + output_quantization_max_val = 2147483647 : i64} : ( + tensor<2x2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, + tensor, tensor, + tensor, tensor, + tensor<2xf32>, tensor) -> tensor<2x2x!tf_type.qint32> + func.return +} + +// ----- + +func.func @testUniformQuantizedClipByValue( + %operand: tensor<*x!tf_type.qint32>, %min: tensor, %max: tensor, + %scales: tensor<2xf32>, %zps: tensor) -> () { + // expected-error @below {{'tf.UniformQuantizedClipByValue' op quantization_axis is -1, scales must have 0 rank.}} + %0 = "tf.UniformQuantizedClipByValue"(%operand, %min, %max, %scales, %zps) { + quantization_axis = -1 : i64, + quantization_min_val = -2147483648 : i64, + quantization_max_val = 2147483647 : i64 + } : ( + tensor<*x!tf_type.qint32>, tensor, tensor, + tensor<2xf32>, tensor + ) -> tensor<*x!tf_type.qint32> + func.return +} + // Following tests are for LegacyCall symbol use verifier. // ----- diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_location_roundtrip.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_location_roundtrip.mlir index d5e1a637b7f..fd69ecd1436 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_location_roundtrip.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_location_roundtrip.mlir @@ -18,6 +18,7 @@ // tf_executor.island, tf.Identity, and tf_executor.yield). // CHECK-LABEL: "func.func" +// CHECK: sym_name = "island_one_op_all_locs_same" // CHECK: "tf_executor.graph"() ({ // CHECK-NEXT: "tf_executor.island"() ({ // CHECK-NEXT: "tf.Identity"(%{{.*}}) : (tensor) -> tensor loc("identity@some_function") @@ -26,7 +27,6 @@ // CHECK-NEXT: "tf_executor.fetch"(%{{.*}}) : (tensor) -> () loc(unknown) // CHECK-NEXT: }) : () -> tensor loc(unknown) // CHECK-NEXT: "func.return"(%{{.*}}) : (tensor) -> () loc(unknown) -// CHECK-NEXT: sym_name = "island_one_op_all_locs_same" func.func @island_one_op_all_locs_same(%arg0: tensor) -> tensor { %0 = "tf_executor.graph"() ({ @@ -45,6 +45,7 @@ func.func @island_one_op_all_locs_same(%arg0: tensor) -> tensor { // don't have identical locations. // CHECK-LABEL: "func.func" +// CHECK: sym_name = "island_one_op_all_locs_NOT_same" // CHECK: "tf_executor.graph"() ({ // CHECK-NEXT: "tf_executor.island"() ({ // CHECK-NEXT: "tf.Identity"(%{{.*}}) : (tensor) -> tensor loc("identity@some_function") @@ -53,7 +54,6 @@ func.func @island_one_op_all_locs_same(%arg0: tensor) -> tensor { // CHECK-NEXT: "tf_executor.fetch"(%{{.*}}) : (tensor) -> () loc(unknown) // CHECK-NEXT: }) : () -> tensor loc(unknown) // CHECK-NEXT: "func.return"(%{{.*}}) : (tensor) -> () loc(unknown) -// CHECK-NEXT: sym_name = "island_one_op_all_locs_NOT_same" func.func @island_one_op_all_locs_NOT_same(%arg0: tensor) -> tensor { %0 = "tf_executor.graph"() ({ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py index ef0a95d756f..b8e7715c593 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py @@ -48,6 +48,7 @@ def do_test( show_debug_info=False, use_lite=False, lift_variables=True, + include_variables_in_initializers=False, ): """Runs test. @@ -70,6 +71,9 @@ def do_test( use_lite: If true, importer will not do any graph transformation such as lift variables. lift_variables: If false, no variable lifting will be done on the graph. 
+ include_variables_in_initializers: If false, removes variables in + initializer functions before lifting variables or adding new variable + initialization patterns in the initializer function. """ # Make LOG(ERROR) in C++ code show up on the console. @@ -124,6 +128,7 @@ def do_test( exported_names, ','.join([tf.saved_model.tag_constants.SERVING]), lift_variables, + include_variables_in_initializers, upgrade_legacy, show_debug_info, ) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/include_variables_in_init_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/include_variables_in_init_v1.py new file mode 100644 index 00000000000..2f99fae8d8d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/include_variables_in_init_v1.py @@ -0,0 +1,87 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# RUN: %p/include_variables_in_init_v1 | FileCheck %s + +# pylint: disable=missing-docstring,line-too-long +import tensorflow.compat.v1 as tf +from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1 + +# Verify that the tf.versions attribute exists. It is difficult to enforce +# contents, since the version numbers change over time. The conversion logic +# itself is verified in the common graphdef converter, so here just assert +# it is being invoked. +# CHECK: module +# CHECK-SAME: tf.versions +# CHECK-SAME: bad_consumers +# CHECK-SAME: min_consumer +# CHECK-SAME: producer + +# CHECK: "tf_saved_model.global_tensor"() +# CHECK: "tf_saved_model.session_initializer"() {initializers = [@[[INIT_FUNC:[a-zA-Z_0-9]+]]]} : () -> () + +# Initializer function. This should contain the initialization sequence for the +# variable. +# CHECK: func @[[INIT_FUNC]](%[[ARG_0:.*]]: tensor>> {tf_saved_model.bound_input = @y}) attributes { +# CHECK-SAME: tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_init"] +# CHECK-SAME: tf_saved_model.initializer_type = "init_op" +# CHECK-SAME: } +# CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {{{.*dense<.*> : tensor<2xi32>.*}}} : () -> tensor<2xi32> +# CHECK: %[[RAND_STD_NORMAL:.*]] = "tf.RandomStandardNormal"(%[[CST_0]]) +# CHECK: "tf.AssignVariableOp"(%[[ARG_0]], %[[RAND_STD_NORMAL]]){{.*}}: (tensor>>, tensor<1x3xf32>) -> () +# CHECK: return + +# The function for the signature "key". 
+# CHECK: func {{@[a-zA-Z_0-9]+}}( +# CHECK-SAME: %[[ARG_1:.*]]: tensor<3x1xf32> {tf_saved_model.index_path = ["x"]} +# CHECK-SAME: %[[ARG_2:.*]]: tensor>> {tf_saved_model.bound_input = @y} +# CHECK-SAME: -> (tensor<3x3xf32> {tf_saved_model.index_path = ["r"]}) +# CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["key"] +# CHECK-NEXT: %[[READ_VAR_0:.*]] = "tf.ReadVariableOp"(%[[ARG_2]]) {{{.*}}} : (tensor>>) -> tensor<1x3xf32> +# CHECK-NEXT: %[[MATMUL_0:.*]] = "tf.MatMul"(%[[ARG_1]], %[[READ_VAR_0]]) {{{.*}}} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> +# CHECK-NEXT: return %[[MATMUL_0]] : tensor<3x3xf32> + + +def Test(): + x = tf.constant([[1.0], [1.0], [1.0]]) + y = tf.compat.v1.get_variable( + name='y', + shape=(1, 3), + initializer=tf.random_normal_initializer(), + trainable=True, + ) + r = tf.matmul(x, y) + + tensor_info_x = tf.compat.v1.saved_model.utils.build_tensor_info(x) + tensor_info_r = tf.compat.v1.saved_model.utils.build_tensor_info(r) + + return ( + { + 'key': ( + tf.compat.v1.saved_model.signature_def_utils.build_signature_def( + inputs={'x': tensor_info_x}, + outputs={'r': tensor_info_r}, + method_name='some_function', + ) + ) + }, + tf.initializers.global_variables(), + None, + ) + + +if __name__ == '__main__': + common_v1.set_tf_options() + common_v1.do_test(Test, include_variables_in_initializers=True) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-annotate-dynamic-shape-inputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-annotate-dynamic-shape-inputs.mlir new file mode 100644 index 00000000000..accdbdacca8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-annotate-dynamic-shape-inputs.mlir @@ -0,0 +1,28 @@ +// RUN: tf-opt -split-input-file -verify-diagnostics -tf-tpu-annotate-dynamic-shape-inputs %s | FileCheck %s + +// Test that annotate the inputs of the cluster func to be dynamic shaped. 
+ +module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func.func @main( + %arg0: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, + %arg1: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) -> tensor<2048xi32> { + %cst = "tf.Const"() {value = dense<1024> : tensor} : () -> tensor + %0:2 = "tf_device.launch"() ({ + %1 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<2048xi64>) -> tensor<2048xi32> + %2 = "tf.Cast"(%arg1) {Truncate = false} : (tensor<2048xi64>) -> tensor<2048xi32> + %3:2 = "tf.TPUCopyWithDynamicShape"(%1, %2, %cst, %cst) {operand_segment_sizes = array} : (tensor<2048xi32>, tensor<2048xi32>, tensor, tensor) -> (tensor<2048xi32>, tensor<2048xi32>) + // CHECK-NOT: tf.TPUAnnotateTensorsWithDynamicShape + %4:2 = "tf.TPUAnnotateTensorsWithDynamicShape"(%3#0, %3#1) : (tensor<2048xi32>, tensor<2048xi32>) -> (tensor<2048xi32>, tensor<2048xi32>) + tf_device.return %4#0, %4#1 : tensor<2048xi32>, tensor<2048xi32> + }) {device = "TPU_REPLICATED_HOST_0"} : () -> (tensor<2048xi32>, tensor<2048xi32>) + %1 = "tf_device.cluster_func"(%0#0, %0#1) {_replication_info = "cluster_test_fn", func = @tpu_func} : (tensor<2048xi32>, tensor<2048xi32>) -> tensor<2048xi32> + return %1: tensor<2048xi32> + } + // CHECK-LABEL: func @tpu_func + // CHECK: mhlo.type_extensions + func.func @tpu_func ( + %arg0: tensor<2048xi32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg1: tensor<2048xi32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) -> (tensor<2048xi32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) { + %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2048xi32>, tensor<2048xi32>) -> tensor<2048xi32> + return %0 : tensor<2048xi32> + } +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir index 355085be8b4..db266ed4afe 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir @@ -540,6 +540,90 @@ func.func @replicated_non_replicated_output() { // ----- +// TF produces Identity ops between TPUReplicatedOutput and +// TPUPartitionedOutputV2 ops. This test ensures that they are erased +// and not considered within the clustered computation. It also ensures that +// the expected interleaving pattern is present in the output. 
+ +func.func @partitioned_outputs(%arg0: tensor) -> (tensor, tensor, tensor, tensor) { + %pi0 = "tf.TPUPartitionedInputV2"(%arg0) {N = 2, partition_dims = [], _XlaSharding = "", is_packed = true} : (tensor) -> (tensor) + %pi1 = "tf.TPUPartitionedInputV2"(%arg0) {N = 2, partition_dims = [], _XlaSharding = "", is_packed = true} : (tensor) -> (tensor) + %1 = "tf.TPUReplicatedInput"(%pi0, %pi1) {is_mirrored_variable = true, is_packed = false} : (tensor, tensor) -> (tensor) + %2 = "tf.opA"(%1) {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", is_stateless = true} : (tensor) -> (tensor) + %3:2 = "tf.TPUReplicatedOutput"(%2) : (tensor) -> (tensor, tensor) + %4 = "tf.Identity"(%3#0) : (tensor) -> (tensor) + %5:2 = "tf.TPUPartitionedOutputV2"(%4) {_XlaSharding = "", partition_dims = []} : (tensor) -> (tensor, tensor) + %6 = "tf.Identity"(%3#1) : (tensor) -> (tensor) + %7:2 = "tf.TPUPartitionedOutputV2"(%6) {_XlaSharding = "", partition_dims = []} : (tensor) -> (tensor, tensor) + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 2, num_cores_per_replica = 2, topology = "topology"} : () -> () + func.return %5#0, %5#1, %7#0, %7#1 : tensor, tensor, tensor, tensor +} + +// CHECK: [[REPLICATE:%.+]]:4 = tf_device.replicate +// CHECK: return [[REPLICATE]]#0, [[REPLICATE]]#2, [[REPLICATE]]#1, [[REPLICATE]]#3 + +// ----- + +// Ensures that mixed partitioned and replicated outputs +// works in the multi-replica case. +func.func @mixed_partitioned_outputs(%arg0: tensor) -> (tensor, tensor) { + %pi0 = "tf.TPUPartitionedInputV2"(%arg0) {N = 2, partition_dims = [], _XlaSharding = "", is_packed = true} : (tensor) -> (tensor) + %pi1 = "tf.TPUPartitionedInputV2"(%arg0) {N = 2, partition_dims = [], _XlaSharding = "", is_packed = true} : (tensor) -> (tensor) + %1 = "tf.TPUReplicatedInput"(%pi0, %pi1) {is_mirrored_variable = true, is_packed = false} : (tensor, tensor) -> (tensor) + %2:2 = "tf.opA"(%1) {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", is_stateless = true} : (tensor) -> (tensor, tensor) + %3:2 = "tf.TPUReplicatedOutput"(%2#0) : (tensor) -> (tensor, tensor) + %5:2 = "tf.TPUPartitionedOutputV2"(%3#0) {_XlaSharding = "", partition_dims = []} : (tensor) -> (tensor, tensor) + %7:2 = "tf.TPUPartitionedOutputV2"(%3#1) {_XlaSharding = "", partition_dims = []} : (tensor) -> (tensor, tensor) + %8:2 = "tf.TPUReplicatedOutput"(%2#1) : (tensor) -> (tensor, tensor) + %9 = "tf.opB"(%5#0, %5#1, %7#0, %7#1) : (tensor, tensor, tensor, tensor) -> (tensor) + %10 = "tf.opC"(%8#0, %8#1) : (tensor, tensor) -> (tensor) + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 2, num_cores_per_replica = 2, topology = "topology"} : () -> () + func.return %9, %10 : tensor, tensor +} + +// CHECK: [[REPLICATE:%.+]]:6 = tf_device.replicate +// CHECK: [[OP_B:%.+]] = "tf.opB"([[REPLICATE]]#0, [[REPLICATE]]#2, [[REPLICATE]]#1, [[REPLICATE]]#3) +// CHECK: [[OP_C:%.+]] = "tf.opC"([[REPLICATE]]#4, [[REPLICATE]]#5) + +// ----- + +// For the single replica case: +// - Ensures that Identity ops are ignored. +// - Checks that mixing TPUPartitionedOutputV2 and TPUReplicatedOutput works. 
+ +func.func @single_replica_mixed_partitioned_outputs(%arg0: tensor) -> (tensor, tensor, tensor) { + %0 = "tf.TPUPartitionedInputV2"(%arg0) {N = 2, partition_dims = [], _XlaSharding = "", is_packed = true} : (tensor) -> (tensor) + %1 = "tf.TPUReplicatedInput"(%0) {is_mirrored_variable = true, is_packed = false} : (tensor) -> (tensor) + %2:2 = "tf.opA"(%1) {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", is_stateless = true} : (tensor) -> (tensor, tensor) + %3 = "tf.TPUReplicatedOutput"(%2#0) : (tensor) -> (tensor) + %4 = "tf.Identity"(%3) : (tensor) -> (tensor) + %5:2 = "tf.TPUPartitionedOutputV2"(%4) {_XlaSharding = "", partition_dims = []} : (tensor) -> (tensor, tensor) + %6 = "tf.TPUReplicatedOutput"(%2#1) : (tensor) -> (tensor) + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 1, num_cores_per_replica = 2, topology = "topology"} : () -> () + func.return %5#0, %5#1, %6 : tensor, tensor, tensor +} + +// CHECK: [[CLUSTER:%.+]]:2 = "tf_device.cluster" +// CHECK: [[OUTPUT:%.+]]:2 = "tf.TPUPartitionedOutputV2"([[CLUSTER]]#0) +// CHECK: return [[OUTPUT]]#0, [[OUTPUT]]#1, [[CLUSTER]]#1 + +// ----- + +func.func @replica_mismatch(%arg0: tensor) { + %pi0 = "tf.TPUPartitionedInputV2"(%arg0) {N = 2, partition_dims = [], _XlaSharding = "", is_packed = true} : (tensor) -> (tensor) + %pi1 = "tf.TPUPartitionedInputV2"(%arg0) {N = 2, partition_dims = [], _XlaSharding = "", is_packed = true} : (tensor) -> (tensor) + %1 = "tf.TPUReplicatedInput"(%pi0, %pi1) {is_mirrored_variable = true, is_packed = false} : (tensor, tensor) -> (tensor) + %2 = "tf.opA"(%1) {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", is_stateless = true} : (tensor) -> (tensor) + %3:2 = "tf.TPUReplicatedOutput"(%2) : (tensor) -> (tensor, tensor) + %4 = "tf.Identity"(%3#0) : (tensor) -> (tensor) + // expected-error@+1 {{expected zero or 2 'TPUPartitionedOutput' op(s), instead got 1}} + %5:2 = "tf.TPUPartitionedOutputV2"(%4) {_XlaSharding = "", partition_dims = []} : (tensor) -> (tensor, tensor) + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 2, num_cores_per_replica = 2, topology = "topology"} : () -> () + func.return +} + +// ----- + // Test cluster with missing `num_replicas` attribute. func.func @missing_num_replicas() { @@ -707,6 +791,19 @@ func.func @valid_compilation_cluster_no_replication_op_device() { // ----- +// Check conflicting device names +// CHECK: "tf_device.cluster"() +// CHECK: "tf.opA"() +// CHECK: "tf.opB"() +// CHECK-NOT: device = +func.func @do_nothing_if_short_names_conflict() { + "tf.opA"() { _xla_compile_device_type = "TPU", device = "/replica:1/task:2/device:TPU:1"} : () -> () + "tf.opB"() { _xla_compile_device_type = "TPU", device = "/replica:3/task:4/device:TPU:1"} : () -> () + func.return +} + +// ----- + // Check non-replicated case, including expected device attr in cluster. 
// CHECK: "tf_device.cluster"() // CHECK: "tf.opA"() @@ -924,4 +1021,25 @@ func.func @gpu_device() { func.return } +// ----- +// CHECK-LABEL: func @gather_nd +func.func @gather_nd(%arg0: tensor<*x!tf_type.resource>>, + %arg1: tensor<3xf32>) { + // CHECK: ResourceGatherNd + // CHECK: tf_device.cluster + // CHECK: Add + // CHECK: ResourceGatherNd + %0 = "tf.Const"() {value = dense<32> : tensor} : () -> tensor + %1 = "tf.ResourceGatherNd"(%arg0, %0) { + Tindices = i32 + } : (tensor<*x!tf_type.resource>>, tensor) -> tensor<1x80xf32> + %2 = "tf.Add"(%1, %1) { + _xla_compile_device_type = "TPU", + device = "/task:0/device:TPU:0", dtype = f32 + } : (tensor<1x80xf32>, tensor<1x80xf32>) -> tensor<1x80xf32> + %3 = "tf.ResourceGatherNd"(%arg0, %0) { + Tindices = i32 + } : (tensor<*x!tf_type.resource>>, tensor) -> tensor<1x80xf32> + func.return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir index 05a3f483767..b2896fa543f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir @@ -101,7 +101,7 @@ func.func @testNonTPUDeviceReplicationIgnored(%arg0: tensor<*x!tf_type.resource< // CHECK-SAME: (%[[ARG0]] as %[[RI_0:[a-z0-9]*]]: tensor<*x!tf_type.resource>>) tf_device.replicate(%arg0 as %arg1: tensor<*x!tf_type.resource>>) { _mirrored_variable_indices = [0], - devices = {TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:1"]}, + devices = {TPU_REPLICATED_HOST_0 = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:1"]}, n = 2 : i32} { // CHECK: %[[VAL_OUT:.*]] = "tf.A"() : () -> tensor<4xf32> // CHECK-NEXT: "tf.AssignVariableOp"(%[[RI_0]], %[[VAL_OUT]]) @@ -111,7 +111,7 @@ func.func @testNonTPUDeviceReplicationIgnored(%arg0: tensor<*x!tf_type.resource< "tf_device.launch"() ({ "tf.TPUExecuteAndUpdateVariables"(%arg1, %2) {device_var_reads_indices = [0], device_var_updates_indices = [-1]} : (tensor<*x!tf_type.resource>>, tensor<2x!tf_type.string>) -> () tf_device.return - }) {device = "TPU_REPLICATED_HOST"} : () -> () + }) {device = "TPU_REPLICATED_HOST_0"} : () -> () tf_device.return } func.return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_splits.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_splits.mlir new file mode 100644 index 00000000000..7c97e85c081 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_splits.mlir @@ -0,0 +1,44 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-colocate-splits | FileCheck %s + +// CHECK-LABEL: func @colocate_split_with_pred +func.func @colocate_split_with_pred() { + // CHECK: Split + // CHECK-SAME: _class = ["loc:@class"] + tf_executor.graph { + %c, %control0 = tf_executor.island wraps "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %a, %control1 = tf_executor.island wraps "tf.A"() {_class = ["loc:@class"]} : () -> (tensor<2xf32>) + %s:2, %control2 = tf_executor.island wraps "tf.Split"(%c, %a) {num_split = 2 : i32} : (tensor, tensor<2xf32>) -> (tensor<1xf32>, tensor<1xf32>) + tf_executor.fetch + } + func.return +} + +// ----- + +// CHECK-LABEL: func @colocate_split_with_pred_results +func.func @colocate_split_with_pred_results() { + // CHECK: Split + // CHECK-SAME: _class = ["loc:@class"] + tf_executor.graph { + %c, %control0 = 
tf_executor.island wraps "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %a:2, %control1 = tf_executor.island wraps "tf.A"() {_class = ["loc:@class"]} : () -> (tensor<2xf32>, tensor<2xf32>) + %s:2, %control2 = tf_executor.island wraps "tf.Split"(%c, %a#1) {num_split = 2 : i32} : (tensor, tensor<2xf32>) -> (tensor<1xf32>, tensor<1xf32>) + tf_executor.fetch + } + func.return +} + +// ----- + +// CHECK-LABEL: func @no_colocate_split_has_device +func.func @no_colocate_split_has_device() { + // CHECK: Split + // CHECK-NOT: _class = ["loc:@class"] + tf_executor.graph { + %c, %control0 = tf_executor.island wraps "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %a, %control1 = tf_executor.island wraps "tf.A"() {_class = ["loc:@class"]} : () -> tensor<2xf32> + %s:2, %control2 = tf_executor.island wraps "tf.Split"(%c, %a) {num_split = 2 : i32, device = "device"} : (tensor, tensor<2xf32>) -> (tensor<1xf32>, tensor<1xf32>) + tf_executor.fetch + } + func.return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_resource_partitioning.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_resource_partitioning.mlir index 0e1c5c79e22..91e4ff2b714 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_resource_partitioning.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_resource_partitioning.mlir @@ -190,7 +190,7 @@ func.func @with_host_process(%arg0: tensor>>, %arg "tf_device.launch"() ({ "tf.OpA"(%1) : (tensor) -> () tf_device.return - }) {device = "TPU_REPLICATED_HOST"} : () -> () + }) {device = "TPU_REPLICATED_HOST_0"} : () -> () tf_device.return }, { %3 = "tf_device.cluster_func"(%1) {func = @computation, use_spmd_for_xla_partitioning = true} : (tensor) -> tensor @@ -231,7 +231,7 @@ func.func @non_replicated_sharding(%arg0: tensor>> "tf_device.launch"() ({ "tf.OpA"(%1) : (tensor) -> () tf_device.return - }) {device = "TPU_REPLICATED_HOST"} : () -> () + }) {device = "TPU_REPLICATED_HOST_0"} : () -> () tf_device.return }, { %3 = "tf_device.cluster_func"(%1) {func = @computation, use_spmd_for_xla_partitioning = true} : (tensor) -> tensor @@ -251,7 +251,7 @@ func.func @packed_replicated(%arg0: tensor>> {tf.d "tf_device.launch"() ({ "tf.OpA"(%1) : (tensor) -> () tf_device.return - }) {device = "TPU_REPLICATED_HOST"} : () -> () + }) {device = "TPU_REPLICATED_HOST_0"} : () -> () tf_device.return }, { %3 = "tf_device.cluster_func"(%1) {func = @computation, use_spmd_for_xla_partitioning = true} : (tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index c81a69f791f..5896c243e2c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -752,7 +752,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK-SAME: ([%[[A_OUTPUT]], %[[ARG_0]]] as %[[RI_0:[a-z0-9]*]]: tensor) - // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]} + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST_0 = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]} // CHECK-SAME: n = 2 
%1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[RI_0]]) @@ -1585,7 +1585,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // CHECK-LABEL: func @replicated_parallel_execute func.func @replicated_parallel_execute(%arg0: tensor<8xi32>, %arg1: tensor<8xi32>) -> (tensor<8xi32>, tensor<8xi32>) { // CHECK: tf_device.replicate - // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"], TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"], TPU_REPLICATED_HOST = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:1/device:CPU:0"]} + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"], TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"], TPU_REPLICATED_HOST_0 = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:1/device:CPU:0"], TPU_REPLICATED_HOST_1 = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:1/device:CPU:0"]} %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} { // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"() @@ -2643,3 +2643,90 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", return %1 : tensor } } + +// ----- + +// The following xla.OpSharding is used: +// Proto debug string: +// type : OTHER +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// last_tile_dims: REPLICATED +// Serialized string: +// "\08\03\1A\06\01\01\01\01\01\01\22\01\00B\01\00" + +// Test that an input sharding with last_tile_dims REPLICATED won't generate SplitOp. 
+//CHECK-NOT: tf.Split +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { + func.func @cluster_to_single_core(%arg0: tensor<128xf32>) -> tensor<128xf32> { + %0 = "tf_device.cluster_func"(%arg0) {_xla_compile_device_type = "TPU", _replication_info = "cluster1", func = @_func, num_replica = 1, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", device_assignment = [], input_sharding_configuration = ["\08\03\1A\06\01\01\01\01\01\01\22\01\00B\01\00"], output_sharding_configuration = [""], use_spmd_for_xla_partitioning = false, use_tpu = true} : (tensor<128xf32>) -> tensor<128xf32> + func.return %0 : tensor<128xf32> + } + func.func @_func(%arg0: tensor<128xf32>) -> tensor<128xf32> { + func.return %arg0 : tensor<128xf32> + } +} + +// ----- + +// CHECK-LABEL: func @annotate_dynamic_shape_tensor +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:COMPOSITE:0", "/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1437 : i32}} { + func.func @annotate_dynamic_shape_tensor(%arg0: tensor<512xi64> {tf._user_specified_name = "190", tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) -> (tensor<512xi32>) { + %0 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "cluster_test_fn", device = ""} : () -> tensor + %cst = "tf.Const"() {value = dense<512> : tensor} : () -> tensor + %2:4 = "tf_device.launch"() ({ + %4 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<512xi64>) -> tensor<512xi32> + %5 = "tf.TPUCopyWithDynamicShape"(%4, %cst) {operand_segment_sizes = array} : (tensor<512xi32>, tensor) -> tensor<512xi32> + tf_device.return %5 : tensor<512xi32> + }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<512xi32>, tensor<1024xi32>, tensor<1024xi32>, tensor<1024xf32>) + // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:4 = "tf_device.launch" + // CHECK: "tf._TPUCompileMlir"() + // CHECK: is_bounded_dynamic_dim: true + %3 = "tf_device.cluster_func"(%2#0) {_dynamic_arg_index = [0 : i32], _has_manual_control_dependencies = true, _replication_info = "cluster_test_fn", _xla_compile_device_type = "TPU", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], func = @_func, host_compute_core = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], num_cores_per_replica = 1 : i64, output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", tpu_compile_options_proto = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : (tensor<512xi32>) -> tensor<512xi32> + return %3: tensor<512xi32> + } +func.func private @_func(%arg0: tensor> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) -> (tensor<512xi32>) { + %0 = "tf.A"(%arg0) {} : (tensor>) -> tensor<512xi32> + return %0 : tensor<512xi32> + } +} + +// ----- + +// The following xla.OpSharding is used: +// Proto debug string: +// type : OTHER +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 4 +// tile_assignment_devices: 0 +// tile_assignment_devices: 1 +// tile_assignment_devices: 2 +// tile_assignment_devices: 3 +// last_tile_dims: REPLICATED 
+// Serialized string: +// "\08\03\1A\03\01\01\04\22\04\00\01\02\03B\01\00" + +// Test that SplitOp is not generated when an input sharding has +// last_tile_dims REPLICATED and more tile_assignment_dimensions +// than tensor dimensions, even when the SPMD sharding is enabled and +// num_cores_per_replica is more than 1. +// Test that ConcatV2 Op is not generated when an output sharding has +// last_tile_dims REPLICATED and more tile_assignment_dimensions +// than tensor dimensions, even when the SPMD sharding is enabled and +// num_cores_per_replica is more than 1. +//CHECK-NOT: tf.Split +// CHECK-NOT: tf.ConcatV2 +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1", "/job:worker/replica:0/task:0/device:TPU:2", "/job:worker/replica:0/task:0/device:TPU:3"]} { + func.func @cluster_to_single_core(%arg0: tensor<4x128xf32>) -> tensor<4x128xf32> { + %0 = "tf_device.cluster_func"(%arg0) {_xla_compile_device_type = "TPU", _replication_info = "cluster1", func = @_func, num_replica = 1, num_cores_per_replica = 4, step_marker_location = "STEP_MARK_AT_ENTRY", topology = "\0A\04\02\02\01\01\10\01\18\04\22\10\00\00\00\00\01\00\00\00\00\01\00\00\01\01\00\00", device_assignment = [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0], input_sharding_configuration = ["\08\03\1A\03\01\01\04\22\04\00\01\02\03B\01\00"], output_sharding_configuration = ["\08\03\1A\03\01\01\04\22\04\00\01\02\03B\01\00"], use_spmd_for_xla_partitioning = true, use_tpu = true} : (tensor<4x128xf32>) -> tensor<4x128xf32> + func.return %0 : tensor<4x128xf32> + } + func.func @_func(%arg0: tensor<4x128xf32>) -> tensor<4x128xf32> { + func.return %arg0 : tensor<4x128xf32> + } +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir index 921248cf473..468e3495439 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir @@ -364,6 +364,86 @@ func.func @cluster_func(%arg0: tensor<*xf32>) { // ----- +// Tests TPIv2 with a "partially tiled" XLA annotation where: +// type: OTHER +// tile_assignment_dimensions: [4, 1, 1, 1, 2] +// tile_assignment_devices: [0, 1, 2, 3, 4, 5, 6, 7] +// replicate_on_last_tile_dim: true +// Serialized string: +// "\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\070\01" + +// CHECK-LABEL: func @partial_tile_partitioned_variable +func.func @partial_tile_partitioned_variable(%arg0: tensor>>) { + %0 = "tf.TPUPartitionedInputV2"(%arg0) {_XlaSharding = "\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\070\01", partition_dims = [4, 1, 1, 1, 2], is_packed = true} : (tensor>>) -> tensor>> + %1 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<4x4x4x4xf32> + // CHECK: tf_device.cluster_func + // CHECK-SAME: input_sharding_configuration = ["\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\070\01"] + // CHECK-SAME: output_sharding_configuration = [] + // CHECK-SAME: use_spmd_for_xla_partitioning = true + "tf_device.cluster_func"(%1) {func = @cluster_func, use_spmd_for_xla_partitioning = true, num_cores_per_replica = 8 : i64} : (tensor<4x4x4x4xf32>) -> () + func.return +} + +// CHECK-LABEL: func @cluster_func +// CHECK-SAME: ({{.+}}: tensor<4x4x4x4xf32> {mhlo.sharding = 
"\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\070\01"}) +func.func @cluster_func(%arg0: tensor<4x4x4x4xf32>) { + func.return +} + +// ----- + +// Tests TPIv2 with a "subgroup tiled" XLA annotation where: +// type: OTHER +// tile_assignment_dimensions: [4, 1, 1, 1, 2] +// tile_assignment_devices: [0, 1, 2, 3, 4, 5, 6, 7] +// last_tile_dims: [REPLICATED] +// Serialized string: +// "\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\07B\01\00" + +// CHECK-LABEL: func @subgroup_tile_partitioned_variable +func.func @subgroup_tile_partitioned_variable(%arg0: tensor>>) { + %0 = "tf.TPUPartitionedInputV2"(%arg0) {_XlaSharding = "\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\07B\01\00", partition_dims = [4, 1, 1, 1, 2], is_packed = true} : (tensor>>) -> tensor>> + %1 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<4x4x4x4xf32> + // CHECK: tf_device.cluster_func + // CHECK-SAME: input_sharding_configuration = ["\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\07B\01\00"] + // CHECK-SAME: output_sharding_configuration = [] + // CHECK-SAME: use_spmd_for_xla_partitioning = true + "tf_device.cluster_func"(%1) {func = @cluster_func, use_spmd_for_xla_partitioning = true, num_cores_per_replica = 8 : i64} : (tensor<4x4x4x4xf32>) -> () + func.return +} + +// CHECK-LABEL: func @cluster_func +// CHECK-SAME: ({{.+}}: tensor<4x4x4x4xf32> {mhlo.sharding = "\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\07B\01\00"}) +func.func @cluster_func(%arg0: tensor<4x4x4x4xf32>) { + func.return +} + +// ----- + +// Tests TPIv2 with a "partially tiled" XLA annotation where: +// type: OTHER +// tile_assignment_dimensions: [4, 1, 1, 1, 2] +// tile_assignment_devices: [0, 1, 2, 3, 4, 5, 6, 7] +// replicate_on_last_tile_dim: true +// Serialized string: +// "\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\070\01" + +// This sharding has an extra dimension than the TPIv2's rank, causing an error. + +func.func @partitioned_input_rank_mismatch(%arg0: tensor>>) { + // expected-error @+1 {{rank}} + %0 = "tf.TPUPartitionedInputV2"(%arg0) {_XlaSharding = "\08\03\1A\05\04\01\01\01\02\22\08\00\01\02\03\04\05\06\070\01", partition_dims = [4, 1, 1, 2], is_packed = true} : (tensor>>) -> tensor>> + %1 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<4x4x4xf32> + "tf_device.cluster_func"(%1) {func = @cluster_func, use_spmd_for_xla_partitioning = true, num_cores_per_replica = 8 : i64} : (tensor<4x4x4xf32>) -> () + func.return +} + +func.func @cluster_func(%arg0: tensor<4x4x4xf32>) { + func.return +} + +// ----- + // Tests partitioned inputs/outputs with no sharding (via XLA SPMD) defaults to // replicate sharding (""). @@ -484,6 +564,43 @@ func.func @func(%arg0: tensor<*xi32> {tf.aliasing_output = 1 : i64}, // ----- +// Partial tiled inputs using XlaSharding ops identified as REPLICATED should keep the sharding configuration. 
+// The following xla.OpSharding is used: +// Proto debug string: +// type : OTHER +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 2 +// tile_assignment_devices: 0 +// tile_assignment_devices: 1 +// last_tile_dims: REPLICATED +// Serialized string: +// "\08\03\1A\03\01\01\02\22\02\00\01B\01\00" + +// CHECK-LABEL: func @check_partial_tile_mpmd_fallback +func.func @check_partial_tile_mpmd_fallback(%arg0: tensor<2x7xi64>) -> tensor<2x7xi32> { + // CHECK: tf_device.cluster_func + // CHECK-SAME: input_sharding_configuration = ["\08\03\1A\03\01\01\02\22\02\00\01B\01\00"] + // CHECK-SAME: output_sharding_configuration = [""] + // CHECK-SAME: use_spmd_for_xla_partitioning = true + %0 = "tf_device.cluster_func"(%arg0) { + func = @func, + use_spmd_for_xla_partitioning = true, num_cores_per_replica = 2 : i64 + } : (tensor<2x7xi64>) -> (tensor<2x7xi32>) + %1 = "tf.Identity"(%0) : (tensor<2x7xi32>) -> tensor<2x7xi32> + func.return %1 : tensor<2x7xi32> +} + +// CHECK-LABEL: func @func +// CHECK-SAME: %arg0: tensor<2x7xi64> {mhlo.sharding = "\08\03\1A\03\01\01\02\22\02\00\01B\01\00" +func.func @func(%arg0: tensor<2x7xi64>) -> (tensor<2x7xi32>) { + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<2x7xi64>) -> tensor<2x7xi32> + %1 = "tf.XlaSharding"(%0) {_XlaSharding = "\08\03\1A\03\01\01\02\22\02\00\01B\01\00", sharding = "\08\03\1A\03\01\01\02\22\02\00\01B\01\00", unspecified_dims = []} : (tensor<2x7xi32>) -> tensor<2x7xi32> + func.return %0 : tensor<2x7xi32> +} + +// ----- + // CHECK-LABEL: func @check_arg_sharding_errors func.func @check_arg_sharding_errors(%arg0: tensor<1x2x3xi32>) { // CHECK: tf_device.cluster_func diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_validate_inputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_validate_inputs.mlir index d61f4cfeadf..7edb50ada79 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_validate_inputs.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_validate_inputs.mlir @@ -2,59 +2,142 @@ // CHECK-LABEL: func @num_replicas_replicated func.func @num_replicas_replicated(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { - "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () - %ri = "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor - %out = "tf.opA"(%ri) : (tensor) -> tensor - %ro:2 = "tf.TPUReplicatedOutput"(%out) : (tensor) -> (tensor, tensor) - func.return %ro#0, %ro#1 : tensor, tensor + %0:2 = tf_executor.graph { + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %ro:2, %c2 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out) : (tensor) -> (tensor, tensor) + tf_executor.fetch %ro#0, %ro#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor } // ----- func.func @num_replicas_replicated_input(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { - "tf.TPUReplicateMetadata"() 
{_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () - // expected-error @+1 {{'tf.TPUReplicatedInput' op TF/XLA TPU bridge input check: number of inputs inconsistent. num_replicas=2 no. of inputs=3}} - %ri = "tf.TPUReplicatedInput"(%arg0, %arg1, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor, tensor) -> tensor - %ro:2 = "tf.TPUReplicatedOutput"(%ri) : (tensor) -> (tensor, tensor) - func.return %ro#0, %ro#1 : tensor, tensor + %0:2 = tf_executor.graph { + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + // expected-error @+1 {{'tf.TPUReplicatedInput' op TF2XLA TPU bridge input check: number of inputs inconsistent. num_replicas=2 no. of inputs=3}} + %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor, tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %ro:2, %c2 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out) : (tensor) -> (tensor, tensor) + tf_executor.fetch %ro#0, %ro#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor } // ----- func.func @num_replicas_replicated_input_packed(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { - "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () - // expected-error @+1 {{'tf.TPUReplicatedInput' op TF/XLA TPU bridge input check: packed with number of inputs not 1. num_replicas=2 no. of inputs=2}} - %ri = "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = true} : (tensor, tensor) -> tensor - %ro:2 = "tf.TPUReplicatedOutput"(%ri) : (tensor) -> (tensor, tensor) - func.return %ro#0, %ro#1 : tensor, tensor + %0:2 = tf_executor.graph { + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + // expected-error @+1 {{'tf.TPUReplicatedInput' op TF2XLA TPU bridge input check: packed with number of inputs not 1. num_replicas=2 no. 
of inputs=2}} + %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = true} : (tensor, tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %ro:2, %c2 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out) : (tensor) -> (tensor, tensor) + tf_executor.fetch %ro#0, %ro#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor } // ----- func.func @num_replicas_replicated_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { - "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () - %ri = "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor - // expected-error @+1 {{'tf.TPUReplicatedOutput' op TF/XLA TPU bridge input check: number of outputs inconsistent. num_replicas=2 no. of outputs=3}} - %ro:3 = "tf.TPUReplicatedOutput"(%ri) : (tensor) -> (tensor, tensor, tensor) - func.return %ro#0, %ro#1 : tensor, tensor + %0:2 = tf_executor.graph { + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster"} : (tensor) -> tensor + // expected-error @+1 {{'tf.TPUReplicatedOutput' op TF2XLA TPU bridge input check: number of outputs inconsistent. num_replicas=2 no. of outputs=3}} + %ro:3, %c2 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out) : (tensor) -> (tensor, tensor, tensor) + tf_executor.fetch %ro#0, %ro#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor } // ----- func.func @num_core_per_replica_partitioned_input(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { - "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () - // expected-error @+1 {{'tf.TPUPartitionedInput' op TF/XLA TPU bridge input check: number of inputs inconsistent. num_cores_per_replica=2 no. of inputs=3}} - %pi = "tf.TPUPartitionedInput"(%arg0, %arg1, %arg1) {index = 1 : i64} : (tensor, tensor, tensor) -> tensor - %po:2 = "tf.TPUPartitionedOutput"(%pi) : (tensor) -> (tensor, tensor) - func.return %po#0, %po#1 : tensor, tensor + %0:2 = tf_executor.graph { + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () + // expected-error @+1 {{'tf.TPUPartitionedInput' op TF2XLA TPU bridge input check: number of inputs inconsistent. num_cores_per_replica=2 no. 
of inputs=3}} + %pi, %c0 = tf_executor.island wraps "tf.TPUPartitionedInput"(%arg0, %arg1, %arg1) {index = 1 : i64} : (tensor, tensor, tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%pi) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %po:2, %c2 = tf_executor.island wraps "tf.TPUPartitionedOutput"(%out) : (tensor) -> (tensor, tensor) + tf_executor.fetch %po#0, %po#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor } // ----- func.func @num_core_per_replica_partitioned_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { - "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () - %pi = "tf.TPUPartitionedInput"(%arg0, %arg1) {index = 1 : i64} : (tensor, tensor) -> tensor - // expected-error @+1 {{'tf.TPUPartitionedOutput' op TF/XLA TPU bridge input check: number of outputs inconsistent. num_cores_per_replica=2 no. of outputs=3}} - %po:3 = "tf.TPUPartitionedOutput"(%pi) : (tensor) -> (tensor, tensor, tensor) - func.return %po#0, %po#1 : tensor, tensor -} \ No newline at end of file + %0:2 = tf_executor.graph { + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () + %pi, %c0 = tf_executor.island wraps "tf.TPUPartitionedInput"(%arg0, %arg1) {index = 1 : i64} : (tensor, tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%pi) {_tpu_replicate = "cluster"} : (tensor) -> tensor + // expected-error @+1 {{'tf.TPUPartitionedOutput' op TF2XLA TPU bridge input check: number of outputs inconsistent. num_cores_per_replica=2 no. 
of outputs=3}} + %po:3, %c2 = tf_executor.island wraps "tf.TPUPartitionedOutput"(%out) : (tensor) -> (tensor, tensor, tensor) + tf_executor.fetch %po#0, %po#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor +} + +// ----- + +func.func @validate_tpu_replicate_no_attr(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { + %0:2 = tf_executor.graph { + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate="cluster"}: (tensor) -> tensor + // expected-warning @+1 {{TF2XLA TPU bridge input check: cluster op = tf.opA with cluster = cluster has successor as non cluster op tf.opB}} + %out2, %c2 = tf_executor.island wraps "tf.opB"(%out) : (tensor) -> tensor + // expected-error @+1 {{tf.TPUReplicatedOutput' op TF2XLA TPU bridge input check: non-cluster op = tf.opB has invalid successor op = tf.TPUReplicatedOutput}} + %ro:2, %c4 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out2) : (tensor) -> (tensor, tensor) + tf_executor.fetch %ro#0, %ro#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor +} + +// ----- + +func.func @validate_tpu_replicate_wrong_attr(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { + %0:2 = tf_executor.graph { + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster_wrong"}: (tensor) -> tensor + // expected-error @+1 {{'tf.opB' op TF2XLA TPU bridge input check: mismatch clusters tpu_replicate attr. 
Parent op tf.opA with cluster = cluster_wrong has successor cluster op tf.opB with cluster = cluster}} + %out2, %c2 = tf_executor.island wraps "tf.opB"(%out) {_tpu_replicate = "cluster"}: (tensor) -> tensor + %ro:2, %c3 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out2) : (tensor) -> (tensor, tensor) + tf_executor.fetch %ro#0, %ro#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor +} + +// ----- + +func.func @valid_xla_nonxla(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { + %0:2 = tf_executor.graph { + %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster", device = "TPU"} : (tensor) -> tensor + %ro:2, %c2 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out) : (tensor) -> (tensor, tensor) + tf_executor.fetch %ro#0, %ro#1 : tensor, tensor + } + return %0#0, %0#1 : tensor, tensor +} + +// ----- + +func.func @valid_xla_nonxla_warning(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor<*x!tf_type.string>, tensor<*x!tf_type.string>) { + %0:2 = tf_executor.graph { + %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor<*x!tf_type.string> + // expected-warning @+1 {{TF/XLA TPU bridge input check: found invalid op. 
tf.Identity can't be both xla and non-xla}} + %out, %c1 = tf_executor.island(%c0) wraps "tf.Identity"(%ri) {_tpu_replicate = "cluster", device = ""} : (tensor<*x!tf_type.string>) -> tensor<*x!tf_type.string> + %ro:2, %c2 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out) : (tensor<*x!tf_type.string>) -> (tensor<*x!tf_type.string>, tensor<*x!tf_type.string>) + tf_executor.fetch %ro#0, %ro#1 : tensor<*x!tf_type.string>, tensor<*x!tf_type.string> + } + return %0#0, %0#1 : tensor<*x!tf_type.string>, tensor<*x!tf_type.string> +} + +// ----- \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/tests/transpose-op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/transpose-op.mlir new file mode 100644 index 00000000000..d719977dc36 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/transpose-op.mlir @@ -0,0 +1,10 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics + +func.func @out_of_bounds_check(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> { + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Const"() {value = dense<[0, 0x4141, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %2 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> + // expected-error @+1 {{'tf.Transpose' op perm[1]=16705 must be in range [-4, 4)}} + %3 = "tf.Transpose"(%2, %1) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x8xf32> + func.return %3 : tensor<1x4x4x8xf32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite.mlir index 973fe031d75..675ae224f6b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite.mlir @@ -1,72 +1,50 @@ // RUN: tf-opt %s -split-input-file -tf-xla-rewrite | FileCheck %s -// CHECK-LABEL: func.func @convert_partitioned_call -func.func @convert_partitioned_call(%arg0: tensor) -> tensor { - %0 = "tf_device.cluster"() ({ - // CHECK: "tf.XlaLaunch"(%arg0) {_xla_compile_device_type = "CPU", device = "/device:CPU:0", function = @pcall_func, operand_segment_sizes = array} : (tensor) -> tensor - %1 = "tf.PartitionedCall"(%arg0) {_xla_compile_device_type = "CPU", config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @pcall_func} : (tensor) -> (tensor) - tf_device.return %1 : tensor - }) : () -> tensor - func.return %0 : tensor -} -func.func @pcall_func(%arg0: tensor) -> tensor { - func.return %arg0 : tensor +module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:GPU:0"]} { + // CHECK-LABEL: func.func @convert_cluster_func + func.func @convert_cluster_func(%arg0: tensor) -> tensor { + // CHECK: "tf.XlaLaunch"(%arg0) {function = @func, operand_segment_sizes = array} : (tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0) {func = @func} : (tensor) -> tensor + func.return %0 : tensor + } + + func.func @func(%arg0: tensor) -> tensor { + func.return %arg0 : tensor + } } // ----- -// CHECK-LABEL: func.func @convert_stateful_partitioned_call -func.func @convert_stateful_partitioned_call(%arg0: tensor) -> tensor { - %0 = "tf_device.cluster"() ({ - // CHECK: "tf.XlaLaunch"(%arg0) {_xla_compile_device_type = "CPU", device = "/device:CPU:0", function = @stateful_pcall_func, operand_segment_sizes = array} : (tensor) -> tensor - %1 = "tf.StatefulPartitionedCall"(%arg0) {_xla_compile_device_type = "CPU", config = 
"", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @stateful_pcall_func} : (tensor) -> (tensor) - tf_device.return %1 : tensor - }) : () -> tensor +module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:GPU:0"]} { + // CHECK-LABEL: func.func @convert_cluster_func_with_resources_in_order + func.func @convert_cluster_func_with_resources_in_order(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: "tf.XlaLaunch"(%arg1, %arg0) {function = @func_with_resources_in_order, operand_segment_sizes = array} : (tensor, tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg1, %arg0) {func = @func_with_resources_in_order} : (tensor, tensor) -> (tensor) + func.return %0 : tensor + } - func.return %0 : tensor -} - -func.func @stateful_pcall_func(%arg0: tensor) -> tensor { - func.return %arg0 : tensor + func.func @func_with_resources_in_order(%arg0 : tensor, %arg1 : tensor) -> tensor { + func.return %arg0 : tensor + } } // ----- -// CHECK-LABEL: func.func @convert_stateful_partitioned_call_with_resources_in_order -func.func @convert_stateful_partitioned_call_with_resources_in_order(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "tf_device.cluster"() ({ - // CHECK: "tf.XlaLaunch"(%arg1, %arg0) {_xla_compile_device_type = "CPU", device = "/device:CPU:0", function = @stateful_pcall_func_with_resources_in_order, operand_segment_sizes = array} : (tensor, tensor) -> tensor - %1 = "tf.StatefulPartitionedCall"(%arg1, %arg0) {_xla_compile_device_type = "CPU", config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @stateful_pcall_func_with_resources_in_order} : (tensor, tensor) -> (tensor) - tf_device.return %1 : tensor - }) : () -> tensor - func.return %0 : tensor -} +module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:GPU:0"]} { + // CHECK-LABEL: func.func @convert_cluster_func_with_resources + func.func @convert_cluster_func_with_resources(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: "tf.XlaLaunch"(%arg1, %arg0) {function = @func_with_resources, operand_segment_sizes = array} : (tensor, tensor) -> tensor + %0 = "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_resources} : (tensor, tensor) -> tensor + // CHECK: "tf.XlaLaunch"(%arg1, %arg0) {function = @func_with_resources, operand_segment_sizes = array} : (tensor, tensor) -> tensor + %1 = "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_resources} : (tensor, tensor) -> tensor + return %0 : tensor + } -func.func @stateful_pcall_func_with_resources_in_order(%arg0 : tensor, %arg1 : tensor) -> tensor { - func.return %arg0 : tensor -} - -// ----- - -// CHECK-LABEL: func.func @convert_stateful_partitioned_call_with_resources -func.func @convert_stateful_partitioned_call_with_resources(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "tf_device.cluster"() ({ - // CHECK: "tf.XlaLaunch"(%arg1, %arg0) {_xla_compile_device_type = "CPU", device = "/device:CPU:0", function = @stateful_pcall_func_with_resources, operand_segment_sizes = array} : (tensor, tensor) -> tensor - %2 = "tf.StatefulPartitionedCall"(%arg0, %arg1) {_xla_compile_device_type = "CPU", config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @stateful_pcall_func_with_resources} : (tensor, tensor) -> tensor - tf_device.return %2 : tensor - }) : () -> tensor - %1 = "tf_device.cluster"() ({ - // CHECK: "tf.XlaLaunch"(%arg1, %arg0) {_xla_compile_device_type = "CPU", device = 
"/device:CPU:0", function = @stateful_pcall_func_with_resources, operand_segment_sizes = array} : (tensor, tensor) -> tensor - %2 = "tf.StatefulPartitionedCall"(%arg0, %arg1) {_xla_compile_device_type = "CPU", config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @stateful_pcall_func_with_resources} : (tensor, tensor) -> tensor - tf_device.return %2 : tensor - }) : () -> tensor - return %0 : tensor -} - -// CHECK-LABEL: func.func @stateful_pcall_func_with_resources -// CHECK-SAME: (%arg0: tensor, %arg1: tensor) -> tensor -// CHECK: return %arg0 : tensor -func.func @stateful_pcall_func_with_resources(%arg0 : tensor, %arg1: tensor) -> tensor { - func.return %arg1 : tensor + // CHECK-LABEL: func.func @func_with_resources + // CHECK-SAME: (%arg0: tensor, %arg1: tensor) -> tensor + // CHECK: return %arg0 : tensor + func.func @func_with_resources(%arg0 : tensor, %arg1: tensor) -> tensor { + func.return %arg1 : tensor + } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index d604cb247b7..74f5a458b0b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -26,22 +26,29 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/metrics.h" #include "tensorflow/core/platform/error_payloads.h" #include "tensorflow/core/protobuf/core_platform_payloads.pb.h" +#include "tensorflow/core/util/debug_data_dumper.h" namespace mlir { namespace { - // Add logger to bridge passmanager. // Enable timing statistics per pass for the bridge passmanager. -void EnableDetailedLogging(PassManager *pm) { +void EnableDetailedLogging(PassManager *pm, + llvm::StringRef module_name = llvm::StringRef()) { // Print the whole module after each pass, which requires disabling // multi-threading as well. pm->getContext()->disableMultithreading(); - pm->enableIRPrinting(std::make_unique( + pm->enableIRPrinting(std::make_unique<::tensorflow::DataDumperLoggerConfig>( + [module_name](const std::string &pass_tag_name) { + return DEBUG_DATA_DUMPER()->GetDumpFilename( + module_name.str(), kDebugGroupBridgePhase1, pass_tag_name); + }, + "", /*print_module_scope=*/true)); pm->enableTiming(); } @@ -50,11 +57,24 @@ void EnableDetailedLogging(PassManager *pm) { namespace TFTPU { namespace { +std::string GetMLIRModuleText(mlir::Operation *op, + const mlir::PassManager *pass_manager) { + std::string module_txt; + llvm::raw_string_ostream os(module_txt); + + if (pass_manager) ::tensorflow::PrintPassPipeline(*pass_manager, op, os); + + op->print(os, mlir::OpPrintingFlags().useLocalScope()); + + return os.str(); +} + // Run the TF XLA Bridge based on the input pipeline, which can be either TPU // bridge pipeline or non TPU bridge pipeline. tensorflow::Status RunTFXLABridge( - ModuleOp module, bool enable_logging, - llvm::function_ref pipeline_builder) { + ModuleOp module, + llvm::function_ref pipeline_builder, + llvm::StringRef module_name = llvm::StringRef()) { // Explicitly check that the TensorFlow dialect can constant fold ops. 
// Constant folding is essential for the bridge. Without this check, the // bridge may fail with an error that is difficult to understand and not @@ -76,18 +96,35 @@ tensorflow::Status RunTFXLABridge( module.getContext(), /*propagate=*/false, /*filter_stack=*/!VLOG_IS_ON(1)); - if (enable_logging || VLOG_IS_ON(1)) { - tensorflow::DumpMlirOpToFile("tf_xla_bridge_before", module, "", &bridge); - if (VLOG_IS_ON(2)) EnableDetailedLogging(&bridge); + if (VLOG_IS_ON(1) || + DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain)) { + ::tensorflow::DumpMlirOpToFile( + DEBUG_DATA_DUMPER()->GetDumpFilename(module_name.str(), kDebugGroupMain, + "tf_xla_bridge_before"), + module, llvm::StringRef(), &bridge); } + + if (VLOG_IS_ON(2) || DEBUG_DATA_DUMPER()->ShouldDump( + module_name.str(), kDebugGroupBridgePhase1)) { + EnableDetailedLogging(&bridge, module_name); + } + LogicalResult result = bridge.run(module); (void)result; - if (enable_logging || VLOG_IS_ON(1)) - tensorflow::DumpMlirOpToFile("tf_xla_bridge_after", module, "", &bridge); + + if (VLOG_IS_ON(1) || + DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain)) { + ::tensorflow::DumpMlirOpToFile( + DEBUG_DATA_DUMPER()->GetDumpFilename(module_name.str(), kDebugGroupMain, + "tf_xla_bridge_after"), + module, llvm::StringRef(), &bridge); + } + return diag_handler.ConsumeStatus(); } -void CreateTPUBridgePipelineImpl(OpPassManager &pm) { +void CreateTPUBridgePipelineImpl( + OpPassManager &pm, llvm::StringRef module_name = llvm::StringRef()) { // The following ops must be preserved regardless of reachability. Ideally, // all graphs should have control dependencies to enforce this but this is // currently not the case (see b/177478741). @@ -111,6 +148,7 @@ void CreateTPUBridgePipelineImpl(OpPassManager &pm) { pm.addNestedPass( CreateTPUReorderReplicateAndPartitionedInputsPass()); pm.addNestedPass(TF::CreateDecomposeReduceDatasetPass()); + pm.addPass(TFDevice::CreateEmbeddingPipeliningPass()); pm.addPass(CreateTPUClusterFormationPass()); // Run TPU cluster cleanup attributes so ops with no outside compiled // attribute have no host device attribute. 
@@ -192,11 +230,13 @@ void CreateTPUBridgePipelineImpl(OpPassManager &pm) { pm.addPass(TFDevice::CreateAnnotateParameterReplicationPass()); pm.addNestedPass( mlir::TF::CreateRewriteTPUEmbeddingOpsPass()); - pm.addPass(CreateTPURewritePass()); + pm.addPass(CreateTPUAnnotateDynamicShapeInputsPass()); + pm.addPass(CreateTPURewritePass(module_name)); pm.addPass(createSymbolDCEPass()); pm.addNestedPass( TFDevice::CreateReplicateInvariantOpHoistingPass()); pm.addPass(CreateTPUMergeVariablesWithExecutePass()); + pm.addNestedPass(CreateExtractTPUCopyWithDynamicShapeOpPass()); pm.addNestedPass( TF::CreateHoistReplicateInvariantResourceWritesPass()); pm.addNestedPass(CreateTPUColocateCompositeResourceOps()); @@ -205,11 +245,11 @@ void CreateTPUBridgePipelineImpl(OpPassManager &pm) { } } // namespace -void CreateTPUBridgePipeline(OpPassManager &pm) { +void CreateTPUBridgePipeline(OpPassManager &pm, llvm::StringRef module_name) { pm.addPass(CreateTPUValidateInputsPass()); pm.addNestedPass( TF::CreateCanonicalizeCompileAndReplicateAttributesPass()); - CreateTPUBridgePipelineImpl(pm); + CreateTPUBridgePipelineImpl(pm, module_name); } void CreateTPUBridgePipelineV1(OpPassManager &pm) { @@ -238,27 +278,32 @@ void CreateTPUBridgePipelineV1(OpPassManager &pm) { CreateConvertToLegacyCompileAndReplicateAttributesPass()); } -tensorflow::Status TPUBridge(ModuleOp module, bool enable_logging, - bool fallback_enabled) { - Status status = RunTFXLABridge(module, enable_logging, [](OpPassManager &pm) { - CreateTPUBridgePipeline(pm); - // Add set of passes to lower back to graph (from tf_executor). - // Use graph export pipline V2 in TPU Bridge. - // TODO(hanxiong): Completely replace AddGraphExportLoweringPasses with - // AddGraphExortLoweringPassessV2 in all the code paths (V1 compat pipeline, - // CPU/GPU bridge, etc.) - TF::AddGraphExportLoweringPassesV2(pm); - }); +tensorflow::Status TPUBridge(ModuleOp module, bool fallback_enabled, + llvm::StringRef module_name) { + Status status = RunTFXLABridge( + module, + [module_name](OpPassManager &pm) { + CreateTPUBridgePipeline(pm, module_name); + // Add set of passes to lower back to graph + // (from tf_executor). Use graph export + // pipline V2 in TPU Bridge. + // TODO(hanxiong): Completely replace + // AddGraphExportLoweringPasses with + // AddGraphExortLoweringPassessV2 in all the + // code paths (V1 compat pipeline, CPU/GPU + // bridge, etc.) + TF::AddGraphExportLoweringPassesV2(pm); + }, + module_name); tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter( "tpu", "v2", fallback_enabled, status.ok() ? "success" : "failure"); - OkOrSetErrorCounterPayload( + tsl::OkOrSetErrorCounterPayload( tensorflow::core::platform::ErrorSourceProto::MLIR_BRIDGE_PHASE_1, status); return status; } -tensorflow::Status TPUBridgeV1Compat(ModuleOp module, bool enable_logging, - bool fallback_enabled) { - Status status = RunTFXLABridge(module, enable_logging, [](OpPassManager &pm) { +tensorflow::Status TPUBridgeV1Compat(ModuleOp module, bool fallback_enabled) { + Status status = RunTFXLABridge(module, [](OpPassManager &pm) { CreateTPUBridgePipelineV1(pm); // Add set of passes to lower back to graph (from tf_executor). 
TF::AddGraphExportLoweringPasses(pm); @@ -272,6 +317,8 @@ tensorflow::Status TPUBridgeV1Compat(ModuleOp module, bool enable_logging, namespace TF { +void NoCanonicalization(OpPassManager &pm) {} + void AddGraphExportLoweringPasses(OpPassManager &pm) { auto add_pass = [&](std::unique_ptr pass) { pm.addNestedPass(std::move(pass)); @@ -286,6 +333,7 @@ void AddGraphExportLoweringPasses(OpPassManager &pm) { add_pass(TFDevice::CreateLaunchToDeviceAttributePass( /*legacy_graph_export=*/true)); pm.addNestedPass(TFTPU::CreateTPUDevicePropagationPass()); + pm.addNestedPass(TFTPU::CreateTPUColocateSplitsPass()); pm.addPass(createSymbolDCEPass()); if (tensorflow::GetMlirCommonFlags() ->tf_mlir_enable_convert_control_to_data_outputs_pass) { @@ -317,6 +365,7 @@ void AddGraphExportLoweringPassesV2(OpPassManager &pm) { pm.addPass(tf_executor::CreateTFExecutorUpdateControlDependenciesPass()); pm.addNestedPass(TFTPU::CreateTPUDevicePropagationPass()); + pm.addNestedPass(TFTPU::CreateTPUColocateSplitsPass()); pm.addPass(createSymbolDCEPass()); if (tensorflow::GetMlirCommonFlags() ->tf_mlir_enable_convert_control_to_data_outputs_pass) { @@ -382,6 +431,13 @@ void CreateTFXLABridgePipeline(OpPassManager &pm) { // shapes. pm.addPass(TF::CreateTFShapeInferencePass()); pm.addNestedPass(createCanonicalizerPass()); + // Inline all the function calls. Do not call canonicalizer to prevent it from + // moving the definition of any constant operand of ops within a cluster to + // its outside. This may cause the op to fail to verify after the cluster is + // outlined, as the constant operand is replaced by an argument. + pm.addPass(mlir::createInlinerPass({}, NoCanonicalization)); + // Lift resource operations out of device computation. This step needs to be + // done after inlining. pm.addPass(TFDevice::CreateResourceOpLiftingPass()); // TODO(b/267193636): Remove this flag when outside compilation // for generic pipeline is landed. @@ -391,10 +447,10 @@ void CreateTFXLABridgePipeline(OpPassManager &pm) { pm.addPass(TFDevice::CreateExtractHeadTailOutsideCompilationPass()); pm.addPass(TFDevice::CreateExtractOutsideCompilationPass()); } + // Outline clusters into cluster functions. + pm.addPass(TFDevice::CreateClusterOutliningPass()); // Rewrite cluster functions into XLA launch ops. pm.addPass(TFDevice::CreateXlaRewritePass()); - // Inline the cluster ops. - pm.addPass(TFDevice::CreateXlaInlineDeviceOpsPass()); // Re-run the canonicalizer pass as some cleanup during resource op lifting // pass opens up some opportunities for canonicalization of cluster ops. // Specifically, we want to eliminate pass through results from the cluster @@ -406,13 +462,16 @@ void CreateTFXLABridgePipeline(OpPassManager &pm) { pm.addPass(TF::CreateTFRegionControlFlowToFunctional()); } -tensorflow::Status RunTFXLABridge(ModuleOp module, bool enable_logging) { +tensorflow::Status RunTFXLABridge(ModuleOp module, + llvm::StringRef module_name) { Status status = mlir::TFTPU::RunTFXLABridge( - module, enable_logging, [](OpPassManager &pm) { + module, + [](OpPassManager &pm) { CreateTFXLABridgePipeline(pm); // Add set of passes to lower back to graph (from tf_executor). 
TF::AddGraphExportLoweringPasses(pm); - }); + }, + module_name); tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter( /*device type*/ "cpu/gpu", /*bridge version*/ "tfxla", /*fallback_enabled*/ false, diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h index 925149dd843..b0125cc592a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BRIDGE_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BRIDGE_H_ +#include + #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "tensorflow/core/lib/core/status.h" @@ -24,20 +26,18 @@ namespace mlir { namespace TFTPU { // Run all the passes involved in transforming the graph before execution so -// that it is suitable for targeting TPUs. When enable_logging is true, enables -// tensorflow::BridgeLogger. When fallback_enabled is true, it means if the -// bridge fails the old bridge will run. This is used for logging and doesn't -// affect any logic. -tensorflow::Status TPUBridge(ModuleOp module, bool enable_logging, - bool fallback_enabled = false); +// that it is suitable for targeting TPUs. When fallback_enabled is true, it +// means if the bridge fails the old bridge will run. This is used for logging +// and doesn't affect any logic. +tensorflow::Status TPUBridge(ModuleOp module, bool fallback_enabled = false, + llvm::StringRef module_name = llvm::StringRef()); // Run all the passes involved in transforming the graph before execution so -// that it is suitable for targeting TPUs. When enable_logging is true, enables -// tensorflow::BridgeLogger. When fallback_enabled is true, it means if the -// bridge fails the old bridge will run. This is used for logging and doesn't -// affect any logic. +// that it is suitable for targeting TPUs. When fallback_enabled is true, it +// means if the bridge fails the old bridge will run. This is used for logging +// and doesn't affect any logic. // This variant of `TPUBridge` is intended for TensorFlow V1 compatibility. -tensorflow::Status TPUBridgeV1Compat(ModuleOp module, bool enable_logging, +tensorflow::Status TPUBridgeV1Compat(ModuleOp module, bool fallback_enabled = false); } // namespace TFTPU @@ -56,7 +56,8 @@ tensorflow::Status RunBridgeWithStandardPipeline(ModuleOp module, bool enable_inliner); // Runs all passes for non TPU (GPU and CPU) graph. -tensorflow::Status RunTFXLABridge(ModuleOp module, bool enable_logging); +tensorflow::Status RunTFXLABridge( + ModuleOp module, llvm::StringRef module_name = llvm::StringRef()); } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge_pass.cc index c05581fd202..19850ddc3aa 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge_pass.cc @@ -33,7 +33,9 @@ mlir::PassPipelineRegistration<> tpu_pipeline( "tf-tpu-bridge", "Run all the passes involved in transforming the graph before execution so " "that it is suitable for targeting TPUs.", - mlir::TFTPU::CreateTPUBridgePipeline); + [](mlir::OpPassManager& pm) { + return mlir::TFTPU::CreateTPUBridgePipeline(pm); + }); // Registers a pipeline builder function for TF TPU V1 bridge. 
mlir::PassPipelineRegistration<> tpu_pipeline_v1( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/call_graph_util.h b/tensorflow/compiler/mlir/tensorflow/transforms/call_graph_util.h index e04d1323352..6d27780316f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/call_graph_util.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/call_graph_util.h @@ -47,7 +47,7 @@ LogicalResult GetOutermostOpsOfType( auto v = symtab.lookup(sym.getRootReference()); if (!v) { // This is not expected to happen in practice. - v.emitError() << "Cannot find function " << sym.getRootReference(); + op->emitError() << "Cannot find function " << sym.getRootReference(); return WalkResult::interrupt(); } worklist.push(v); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc index 25118033f65..4b409ffe1f6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc @@ -428,7 +428,7 @@ static ClusteringState InitializeClusteringState( } // Initialize mapping from the member operation (block argument) to the id. - for (auto &tuple : llvm::enumerate(state.members)) { + for (const auto &tuple : llvm::enumerate(state.members)) { state.member_ids.try_emplace(tuple.value().source, tuple.index()); } @@ -471,7 +471,7 @@ static bool RunClusteringPass(ClusteringState &state, const ClusteringPolicySet &policies) { bool clustered = false; - for (auto &tuple : llvm::enumerate(state.members)) { + for (const auto &tuple : llvm::enumerate(state.members)) { size_t member_id = tuple.index(); Member &member = tuple.value(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc index 23986108112..7bb1ef5a10b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + +#include "absl/strings/str_cat.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -20,6 +24,7 @@ limitations under the License. 
#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project @@ -40,6 +45,10 @@ constexpr char kFuncAttr[] = "func"; struct ClusterOutliningPass : public impl::ClusterOutliningPassBase { + explicit ClusterOutliningPass(bool globally_unique_func_names) { + globally_unique_func_names_ = globally_unique_func_names; + } + void runOnOperation() override; }; @@ -48,6 +57,10 @@ struct ClusterOutliningPass struct LaunchOutliningPass : public impl::LaunchOutliningPassBase { + explicit LaunchOutliningPass(bool globally_unique_func_names) { + globally_unique_func_names_ = globally_unique_func_names; + } + void runOnOperation() override; }; @@ -62,17 +75,29 @@ void ReplaceClusterReturnWithReturn(tf_device::ReturnOp cluster_return_op, // and inserts built function into given module. template func::FuncOp BuildFunction(llvm::ArrayRef live_ins, ClusterOrLaunchOp op, - SymbolTable* symbol_table, OpBuilder* builder) { + SymbolTable* symbol_table, OpBuilder* builder, + bool globally_unique_func_names) { llvm::SmallVector operand_types; operand_types.reserve(live_ins.size()); for (Value v : live_ins) operand_types.emplace_back(v.getType()); auto func_type = builder->getFunctionType(operand_types, op.getResultTypes()); - // TODO(lyandy): Define better name for outlined function. Potentially some - // name can be added during cluster formation. + std::string func_name; + if (globally_unique_func_names) { + // While processing XLA launch ops, signatures are created for each function + // to decide if a function has been compiled. Function signatures are + // decided by function name and input types. By giving each function a + // unique name, we make sure the same signature is not incorrectly given to + // functions of different graphs with same name and input type. + func_name = + absl::StrCat("_func_", size_t(OperationEquivalence::computeHash(op))); + } else { + func_name = "_func"; + } + func::FuncOp outlined_func = - func::FuncOp::create(op.getLoc(), "_func", func_type); + func::FuncOp::create(op.getLoc(), func_name, func_type); // This function is not externally visible and marking it private would allow // symbol-dce pass to remove it when it is not referenced anymore. @@ -108,13 +133,14 @@ func::FuncOp BuildFunction(llvm::ArrayRef live_ins, ClusterOrLaunchOp op, // `tf_device.cluster_func` to invoke that function. `tf_device.cluster` is // removed afterwards.` void OutlineCluster(tf_device::ClusterOp cluster_op, SymbolTable* symbol_table, - OpBuilder* builder) { + OpBuilder* builder, bool globally_unique_func_names) { llvm::SetVector live_ins; getUsedValuesDefinedAbove(cluster_op.getBody(), cluster_op.getBody(), live_ins); func::FuncOp outlined_func = - BuildFunction(live_ins.getArrayRef(), cluster_op, symbol_table, builder); + BuildFunction(live_ins.getArrayRef(), cluster_op, symbol_table, builder, + globally_unique_func_names); cluster_op->setAttr( builder->getStringAttr(kFuncAttr), mlir::SymbolRefAttr::get(builder->getContext(), outlined_func.getName())); @@ -135,12 +161,13 @@ void OutlineCluster(tf_device::ClusterOp cluster_op, SymbolTable* symbol_table, // `tf_device.launch_func` to invoke that function. 
`tf_device.launch` is // removed afterwards.` void OutlineLaunch(tf_device::LaunchOp launch_op, SymbolTable* symbol_table, - OpBuilder* builder) { + OpBuilder* builder, bool globally_unique_func_names) { llvm::SetVector live_ins; getUsedValuesDefinedAbove(launch_op.getBody(), launch_op.getBody(), live_ins); func::FuncOp outlined_func = - BuildFunction(live_ins.getArrayRef(), launch_op, symbol_table, builder); + BuildFunction(live_ins.getArrayRef(), launch_op, symbol_table, builder, + globally_unique_func_names); launch_op->setAttr( builder->getStringAttr(kFuncAttr), mlir::SymbolRefAttr::get(builder->getContext(), outlined_func.getName())); @@ -159,7 +186,8 @@ void ClusterOutliningPass::runOnOperation() { SymbolTable symbol_table(module); OpBuilder builder(module.getContext()); module.walk([&](tf_device::ClusterOp cluster) { - OutlineCluster(cluster, &symbol_table, &builder); + OutlineCluster(cluster, &symbol_table, &builder, + globally_unique_func_names_.getValue()); }); } @@ -168,18 +196,21 @@ void LaunchOutliningPass::runOnOperation() { SymbolTable symbol_table(module); OpBuilder builder(module.getContext()); module.walk([&](tf_device::LaunchOp launch) { - OutlineLaunch(launch, &symbol_table, &builder); + OutlineLaunch(launch, &symbol_table, &builder, + globally_unique_func_names_.getValue()); }); } } // namespace -std::unique_ptr> CreateClusterOutliningPass() { - return std::make_unique(); +std::unique_ptr> CreateClusterOutliningPass( + bool globally_unique_func_names) { + return std::make_unique(globally_unique_func_names); } -std::unique_ptr> CreateLaunchOutliningPass() { - return std::make_unique(); +std::unique_ptr> CreateLaunchOutliningPass( + bool globally_unique_func_names) { + return std::make_unique(globally_unique_func_names); } } // namespace TFDevice diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 4eb8f987a4d..481f2d868e1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -16,6 +16,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h" #include +#include +#include +#include +#include #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project @@ -24,13 +28,27 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/eval_util.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" +#include "tensorflow/tsl/util/device_name_utils.h" namespace mlir { namespace TF { +static bool IsOk(const tensorflow::Status& s) { + if (s.ok()) return true; + VLOG(2) << s.message(); + return false; +} + +#define RETURN_FAILURE_IF_ERROR(expr) \ + if (!IsOk(expr)) { \ + return mlir::failure(); \ + } + // Implements a TF specific policy on when constant folding is allowed. 
// Policy: // @@ -71,15 +89,129 @@ static bool ShouldBeFolded(Operation* inst) { #ifdef TF_DISABLE_CONSTANT_FOLDING constexpr int64_t kResultsSizeThreshold = 0; #else - constexpr int64_t kResultsSizeThreshold = (1 << 23); // 1 MB + constexpr int64_t kResultsSizeThreshold = (1 << 23); // 1 MB #endif - constexpr int64_t kOperandsSizeThreshold = (1 << 30); // 1 GB + constexpr int64_t kOperandsSizeThreshold = (1 << 30); // 128 MB return (operands_size <= kOperandsSizeThreshold) && (has_unknown_shape || (results_size <= kResultsSizeThreshold) || (results_size <= kSizeFactor * operands_size)); } +static const tensorflow::tfrt_stub::FallbackState& GetDefaultFallbackState() { + static const auto* const fallback_state = []() { + tensorflow::SessionOptions session_options; + tensorflow::FunctionDefLibrary fdef_lib; + auto fallback_state = + tensorflow::tfrt_stub::FallbackState::CreateWithCpuDevice( + session_options, fdef_lib) + .value(); + return fallback_state.release(); + }(); + + return *fallback_state; +} + +static std::function)>* GetDefaultRunner() { + static auto* const default_runner = + new std::function)>( + [](const std::function& f) { f(); }); + return default_runner; +} + +static mlir::LogicalResult EvaluateOperation( + mlir::Operation* inst, llvm::ArrayRef operands, + llvm::SmallVectorImpl* results) { + // If any operand is nullptr returns true for a failure. + // TODO(b/120678030): remove this constraint if we find operators can be + // evaluated with some unknown operands. + if (std::any_of(operands.begin(), operands.end(), + [](mlir::Attribute operand) { return !operand; })) { + VLOG(1) << "Can't evaluate since not all operands are constant."; + return mlir::failure(); + } + + // Builds TF operation and sets all the attributes. + std::string node_name = "unnamed"; + if (auto attr = inst->getAttrOfType("name")) { + node_name = std::string(attr.getValue()); + } + auto node_def_or = tensorflow::ConvertTFDialectOpToNodeDef( + inst, node_name.c_str(), /*ignore_unregistered_attrs=*/true); + RETURN_FAILURE_IF_ERROR(node_def_or.status()); + const auto& node_def = node_def_or.value(); + + const auto& fallback_state = GetDefaultFallbackState(); + + // Explicitly set device to Host CPU instead of the device present in device + // attribute of the MLIR op. The assigned device might be remote, not + // available during compilation or compilation only device for on demand + // execution which may create a recursion if used for constant folding. + auto host_cpu = tensorflow::DeviceNameUtils::FullName( + /*job=*/"localhost", /*replica=*/0, /*task=*/0, /*type=*/"CPU", /*id=*/0); + + auto statusor_runner = tensorflow::tfrt_stub::OpKernelRunner::Create( + node_def->op(), node_def->name(), host_cpu, operands.size(), + [&](tensorflow::AttrValueMap* attr_value_map) { + *attr_value_map = node_def->attr(); + return tensorflow::OkStatus(); + }, + fallback_state.device_manager(), + fallback_state.process_function_library_runtime()); + RETURN_FAILURE_IF_ERROR(statusor_runner.status()); + const auto& runner = *statusor_runner; + + VLOG(1) << "Start to evaluate node: " << node_def->DebugString(); + + std::vector inputs; + + // Adds inputs to the TF operation. 
+ for (const auto operand : operands) { + tensorflow::Tensor tensor; + RETURN_FAILURE_IF_ERROR(tensorflow::ConvertToTensor(operand, &tensor)); + inputs.push_back(std::move(tensor)); + } + + std::vector input_values; + for (auto& tensor : inputs) { + input_values.emplace_back(); + input_values.back().tensor = &tensor; + } + + tensorflow::OpKernelContext::Params params; + params.inputs = input_values; + params.device = runner.device(); + params.op_kernel = runner.op_kernel(); + // Still use original device's resource_manager. + params.resource_manager = runner.resource_manager(); + params.input_alloc_attrs = runner.input_alloc_attrs(); + params.output_attr_array = runner.output_alloc_attrs().data(); + // Following two parameters are used to support executing tf.data via + // fallback. + params.function_library = runner.function_library_runtime(); + params.runner = GetDefaultRunner(); + + // Executes the TF operation. + tensorflow::OpKernelContext op_kernel_context(¶ms); + runner.Run(&op_kernel_context); + RETURN_FAILURE_IF_ERROR(op_kernel_context.status()); + + // Converts the outputs to MLIR attributes. + mlir::Builder builder(inst->getContext()); + + for (int i = 0; i < op_kernel_context.num_outputs(); ++i) { + DCHECK(op_kernel_context.mutable_output(i)); + auto attr_or = tensorflow::ConvertTensor( + *op_kernel_context.mutable_output(i), &builder); + RETURN_FAILURE_IF_ERROR(attr_or.status()); + results->push_back(attr_or.value()); + } + + VLOG(1) << "Evaluate node " << node_name << " successfully!"; + + return mlir::success(); +} + LogicalResult ConstantFoldFallbackHook( Operation* inst, ArrayRef operands, SmallVectorImpl& results) { // NOLINT @@ -136,13 +268,6 @@ LogicalResult ConstantFoldFallbackHook( // size/size increase due to folding. if (!ShouldBeFolded(inst)) return failure(); - // TODO(jpienaar): Currently this persists the entire program execution. This - // should instead be per module/set from the Graph being executed in TF (if - // any) so that the value of variables in the context could be read. - // Note: Sharing the context is fine as ops are side-effect free. - static TFE_Context* ctx = GetContextForConstantFold(); - if (!ctx) return failure(); - // Returns directly if any of the operands is not an elements attributes. if (std::any_of(operands.begin(), operands.end(), [](Attribute attr) { return !attr || !attr.isa(); @@ -160,8 +285,7 @@ LogicalResult ConstantFoldFallbackHook( static auto* mu = new tensorflow::mutex(); tensorflow::mutex_lock l(*mu); SmallVector constants; - LogicalResult status = - tensorflow::EvaluateOperation(inst, inputs, ctx, &constants); + LogicalResult status = EvaluateOperation(inst, inputs, &constants); results.assign(constants.begin(), constants.end()); return status; } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc index 4c379b4e5b5..51438ac4901 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc @@ -48,6 +48,7 @@ limitations under the License. 
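For the constant_fold.cc change above, the folding result itself is unchanged; what changes is the mechanism: the op's registered CPU kernel now runs through the TFRT fallback (OpKernelRunner) instead of a process-wide eager TFE_Context, and evaluation always happens on host CPU even when the op carries a device attribute. A schematic example (values and the device string are illustrative):

  %0 = "tf.Const"() {value = dense<2.0> : tensor<f32>} : () -> tensor<f32>
  %1 = "tf.Const"() {value = dense<3.0> : tensor<f32>} : () -> tensor<f32>
  %2 = "tf.Mul"(%0, %1) {device = "/job:worker/replica:0/task:0/device:TPU:0"} : (tensor<f32>, tensor<f32>) -> tensor<f32>

  // The hook evaluates tf.Mul on localhost CPU and rewrites its uses to:
  %3 = "tf.Const"() {value = dense<6.0> : tensor<f32>} : () -> tensor<f32>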
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/verification_utils.h" #include "tensorflow/core/util/matmul_bcast.h" @@ -82,6 +83,27 @@ arith::ConstantOp createI64ConstantOp(llvm::ArrayRef values, return rewriter->create(loc, values_type, constant_attr); } +// Function to create a tf.SumOp to sum the element in 'value' reduced along the +// 'redux_axes'. +TF::SumOp createSumOp(Value value, Location loc, + llvm::ArrayRef redux_axes, + PatternRewriter* rewriter) { + Value redux_op = createI32ConstantOp(redux_axes, loc, rewriter); + + auto value_type = value.getType().cast(); + auto shape = value_type.getShape(); + llvm::SmallVector sum_shape; + for (int i = 0; i < shape.size(); ++i) { + if (std::find(redux_axes.begin(), redux_axes.end(), i) == + redux_axes.end()) { + sum_shape.push_back(shape[i]); + } + } + return rewriter->create( + loc, RankedTensorType::get(sum_shape, value_type.getElementType()), value, + redux_op); +} + TF::TransposeOp createTransposeOp(Value value, Location loc, llvm::ArrayRef permutation, PatternRewriter* rewriter) { @@ -344,6 +366,61 @@ std::tuple FlattenEllipsis( return std::make_tuple(new_lhs, new_rhs, new_output); } +// vectors/maps to map the dimensions of lhs with output in unary einsum op +std::optional GetEinsumDimensionNumbersUnary( + llvm::StringRef equation, RankedTensorType lhs_ty) { + llvm::StringRef lhs; + llvm::StringRef out; + std::tie(lhs, out) = equation.split("->"); + if (lhs.empty() || out.empty()) return std::nullopt; + + // Try to flatten the "..." if possible. + int lhs_named_label, rhs_named_label; + + // following rhs and rhs_ty variables are non-functional here only created to + // comply with the existing API + llvm::StringRef rhs; + RankedTensorType rhs_ty; + + auto available_labels = + GetAvailableLabels(lhs, rhs, &lhs_named_label, &rhs_named_label); + if (!available_labels.has_value()) return std::nullopt; + + auto flattended_labels = + FlattenEllipsis(lhs, lhs_named_label, rhs, rhs_named_label, out, lhs_ty, + rhs_ty, available_labels.value()); + + lhs = std::get<0>(flattended_labels); + out = std::get<2>(flattended_labels); + + auto lhs_map_or = EquationToMap(lhs); + if (!lhs_map_or.has_value()) return std::nullopt; + auto lhs_map = lhs_map_or.value(); + + auto out_map_or = EquationToMap(out); + if (!out_map_or.has_value()) return std::nullopt; + auto out_map = out_map_or.value(); + + EinsumDimensionNumbers dnums; + for (int64_t i = 0; i < lhs.size(); ++i) { + auto out_index = out_map.find(lhs[i]); + if (out_index == out_map.end()) { + dnums.lhs.emplace_back(i); + } else { + dnums.lhs_out.emplace_back(i, out_index->second); + } + } + + for (int64_t i = 0; i < out.size(); ++i) { + auto lhs_index = lhs_map.find(out[i]); + if (lhs_index == lhs_map.end()) { + // out only isn't supported + return std::nullopt; + } + } + return dnums; +} + std::optional GetEinsumDimensionNumbers( llvm::StringRef equation, RankedTensorType lhs_ty, RankedTensorType rhs_ty) { @@ -419,6 +496,62 @@ std::optional GetEinsumDimensionNumbers( return dnums; } +// Function to replace a unary einsum op, that can undergo simple transpose, to +// an explicit transpose op. 
+LogicalResult rewriteToReduceSumAndTranspose(TF::EinsumOp op, + EinsumDimensionNumbers dnums, + PatternRewriter& rewriter) { + auto inputs = op.getInputs(); + Value lhs = inputs.front(); + + // Having indices in dnums.lhs list indicates that the ranks of the input and + // output to the unary einsum are not equal making it non-candidate for simple + // transpose. + bool needs_reduce_sum = false; + if (!dnums.lhs.empty()) { + needs_reduce_sum = true; + llvm::SmallVector reduce_idcs(dnums.lhs.size()); + for (int64_t i = 0; i < dnums.lhs.size(); ++i) { + reduce_idcs[i] = dnums.lhs[i]; + } + + lhs = createSumOp(lhs, lhs.getLoc(), reduce_idcs, &rewriter); + } + + llvm::SmallVector lhs_transpose; + lhs_transpose.reserve(dnums.lhs_out.size()); + + llvm::SmallDenseMap out_lhs_map(dnums.lhs_out.size()); + for (int64_t i = 0; i < dnums.lhs_out.size(); ++i) { + out_lhs_map[std::get<1>(dnums.lhs_out[i])] = std::get<0>(dnums.lhs_out[i]); + } + + bool needs_transpose = false; + for (int64_t i = 0; i < dnums.lhs_out.size(); ++i) { + if (std::get<0>(dnums.lhs_out[i]) > + lhs.getType().cast().getRank() - 1) { + continue; + } + + if (std::get<0>(dnums.lhs_out[i]) != std::get<1>(dnums.lhs_out[i])) { + needs_transpose = true; + } + lhs_transpose.push_back(out_lhs_map[i]); + } + + if (!needs_reduce_sum && !needs_transpose) { + return rewriter.notifyMatchFailure( + op, "unary einsum equation does not require transpose"); + } else if (needs_reduce_sum && !needs_transpose) { + rewriter.replaceOp(op, lhs); + return success(); + } + + lhs = createTransposeOp(lhs, lhs.getLoc(), lhs_transpose, &rewriter); + rewriter.replaceOp(op, lhs); + return success(); +} + std::vector inverseTransposeVector( llvm::ArrayRef input, llvm::ArrayRef permutation) { std::vector output(input.size()); @@ -682,6 +815,27 @@ LogicalResult rewriteToBatchMatmul(TF::EinsumOp op, return success(); } +LogicalResult matchAndRewriteUnaryEinsumOp(TF::EinsumOp op, + PatternRewriter& rewriter) { + if (op->getNumOperands() != 1) { + return rewriter.notifyMatchFailure( + op, "Function only supports unary einsum op"); + } + RankedTensorType lhs = + op.getOperand(0).getType().dyn_cast_or_null(); + if (!lhs) { + return failure(); + } + // unary einsum op is only supported to the case where the operation can be + // replaced using reduce_sum and/or transpose + if (const auto dnums_or = + GetEinsumDimensionNumbersUnary(op.getEquation(), lhs)) { + return rewriteToReduceSumAndTranspose(op, dnums_or.value(), rewriter); + } + + return rewriter.notifyMatchFailure(op, "unsupported einsum lowering"); +} + #define GEN_PASS_DEF_TRANSFORMEINSUMPASS #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" @@ -703,6 +857,10 @@ void TransformEinsumPass::runOnOperation() { LogicalResult ConvertTFEinsumOp::matchAndRewrite( TF::EinsumOp op, PatternRewriter& rewriter) const { + if (op->getNumOperands() == 1) { + return matchAndRewriteUnaryEinsumOp(op, rewriter); + } + RankedTensorType lhs = op.getOperand(0).getType().dyn_cast_or_null(); RankedTensorType rhs = @@ -711,10 +869,10 @@ LogicalResult ConvertTFEinsumOp::matchAndRewrite( return failure(); } - // TODO(b/162328998) Better support Einsum with dynamic input. Currently, one - // dynamic dimension is always supported. If there are two or more dynamic - // dimensions, it is supported if they only exist in a single component - // among: L0,...,Ln R0,...,Rn or C0,...,Cn. + // TODO(b/162328998) Better support Einsum with dynamic input. Currently, + // one dynamic dimension is always supported. 
If there are two or more + // dynamic dimensions, it is supported if they only exist in a single + // component among: L0,...,Ln R0,...,Rn or C0,...,Cn. if (const auto dnums_or = GetEinsumDimensionNumbers(op.getEquation(), lhs, rhs)) return rewriteToBatchMatmul(op, dnums_or.value(), rewriter); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc new file mode 100644 index 00000000000..e5671bf5961 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc @@ -0,0 +1,922 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This pass implements automated pipelining for TPU embeddings defined using +// the TF2 Embedding API. This is designed for applications that have an +// embedding lookup on the SparseCore, followed by one or more dense layers on +// TensorCores, optionally followed by a backward pass (training update) with +// more ops on the SparseCore. Ops are broken up into: +// 1. SC forward pass +// 2. TC forward/backward pass +// 3. SC backward pass +// 4. non-TPU loop counter updates +// These 4 functions are then staggered so as to enable parallel execution. 
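The net effect of this pass on a training loop body is easiest to see in IR. A schematic sketch (call signatures are collapsed to a single tensor each and the data flow is simplified; the four function names follow the "_func_" + name convention used when the bodies are extracted further down):

  func.func private @while_body(%arg0: tensor<i64>, %arg1: tensor<128xf32>)
      -> (tensor<i64>, tensor<128xf32>) {
    // 4. non-TPU loop counter updates
    %ctr = "tf.StatefulPartitionedCall"(%arg0) {f = @_func_non_tpu, config = "", config_proto = "", executor_type = ""} : (tensor<i64>) -> tensor<i64>
    // 1. SC forward pass (embedding lookups)
    %act = "tf.StatefulPartitionedCall"(%arg1) {f = @_func_sc_forward, config = "", config_proto = "", executor_type = ""} : (tensor<128xf32>) -> tensor<128xf32>
    // 2. TC forward/backward pass (dense layers and gradients)
    %grad = "tf.StatefulPartitionedCall"(%act) {f = @_func_core_tpu, config = "", config_proto = "", executor_type = ""} : (tensor<128xf32>) -> tensor<128xf32>
    // 3. SC backward pass (embedding update)
    %upd = "tf.StatefulPartitionedCall"(%grad) {f = @_func_sc_backward, config = "", config_proto = "", executor_type = ""} : (tensor<128xf32>) -> tensor<128xf32>
    return %ctr, %upd : tensor<i64>, tensor<128xf32>
  }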
+ +#include +#include +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" + +#define GEN_PASS_DEF_EMBEDDINGPIPELININGPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" + +static constexpr char kEmbeddingPipelining[] = "_embedding_pipelining"; +static constexpr char kEmbeddingForward[] = "forward"; +static constexpr char kEmbeddingBackward[] = "backward"; +static constexpr char kDevice[] = "device"; +static constexpr llvm::StringRef kTpuCompilationStatus = + "_tpu_compilation_status"; + +namespace mlir { +namespace TFDevice { +namespace { + +struct EmbeddingPipeliningPass + : public ::impl::EmbeddingPipeliningPassBase { + void getDependentDialects(mlir::DialectRegistry& registry) const override { + registry.insert(); + } + + void runOnOperation() override; +}; + +template +std::vector GetValueTypes(const InputContainer& input) { + // Convert a list of mlir::Value's into a list of mlir::Type's + std::vector types; + types.reserve(input.size()); + for (auto val : input) types.push_back(val.getType()); + return types; +} + +bool IsResourceType(Type val_type) { + if (auto tensor_type = val_type.dyn_cast()) { + if (tensor_type.getElementType().isa()) { + return true; + } + } + return false; +} + +bool IsTPUOp(mlir::Operation* op) { + return op->hasAttr(TF::kReplicationInfoAttr); +} + +StringAttr GetReplicationAttr(mlir::Operation* op) { + return op->getAttrOfType(TF::kReplicationInfoAttr); +} + +StringAttr GetReplicationAttr(TF::TPUCompilationResultOp op) { + // Special case for getting the replication region for + // TPUCompilationResultsOp. + return op->getAttrOfType(kTpuCompilationStatus); +} + +int64_t GetNumOps(func::FuncOp func) { + int64_t num_ops = 0; + for (auto it = func.begin(); it != func.end(); ++it) ++num_ops; + return num_ops; +} + +void GatherOpsForExtraction(mlir::SetVector* operations, + const mlir::SetVector& ops_to_avoid, + bool predecessors, bool successors) { + // Walk the input and output dependencies of the Ops in `operations` to form + // the closer of Ops needed to evaluate 'operations'. Input dependencies are + // walked if 'predecessors' is true and output dependencies are walked if + // 'successors' is true. In either case, if a discoverd Op is in the + // 'ops_to_avoid' set, then the dependency walking is terminated. 
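A small example of the closure walk GatherOpsForExtraction performs (the ops are arbitrary TF ops chosen only to form a producer/consumer chain):

  %a = "tf.Const"() {value = dense<1.0> : tensor<f32>} : () -> tensor<f32>
  %b = "tf.Abs"(%a) : (tensor<f32>) -> tensor<f32>
  %c = "tf.Neg"(%b) : (tensor<f32>) -> tensor<f32>
  // Starting from operations = {Abs}:
  //   predecessors = true pulls in Const (operands are walked transitively),
  //   successors   = true pulls in Neg   (users are walked transitively),
  // and the walk skips block arguments and return ops, and never continues
  // past anything in ops_to_avoid.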
+ llvm::SetVector ops_to_process(*operations); + llvm::SetVector new_ops; + + while (!ops_to_process.empty()) { + for (Operation* op : ops_to_process) { + if (predecessors) { + for (Value operand : op->getOperands()) { + // Stop at the block boundary. + if (operand.isa()) continue; + + Operation* predecessor = operand.getDefiningOp(); + if (!operations->contains(predecessor) && + !ops_to_avoid.contains(predecessor)) { + new_ops.insert(operand.getDefiningOp()); + operations->insert(operand.getDefiningOp()); + } + } + } + if (successors) { + for (mlir::Operation* successor : op->getUsers()) { + // Don't include the return op + if (llvm::isa(successor)) continue; + + if (!operations->contains(successor) && + !ops_to_avoid.contains(successor)) { + new_ops.insert(successor); + operations->insert(successor); + } + } + } + } + ops_to_process.swap(new_ops); + new_ops.clear(); + } +} + +TF::StatefulPartitionedCallOp MakeFuncCaller( + mlir::OpBuilder& builder, const Location& loc, func::FuncOp func, + const llvm::SetVector& operands) { + // Constructs a tf.StatefulPartitionedCall to the function provided in 'func' + // using the operands in 'operands'. Assumes the insertion point on builder is + // already set. + auto symbol = + mlir::SymbolRefAttr::get(builder.getContext(), func.getSymName()); + auto result_types = func.getResultTypes(); + auto caller = builder.create( + loc, result_types, operands.getArrayRef(), symbol, + /*config=*/builder.getStringAttr(""), + /*config_proto=*/builder.getStringAttr(""), + /*executor_type=*/builder.getStringAttr("")); + caller.setFAttr(symbol); + return caller; +} + +func::FuncOp CreateFnWithSignature(ModuleOp module, + const llvm::SetVector& inputs, + const llvm::SetVector& outputs, + const std::string& name) { + // Creates an empty func.FuncOp with a signature compatible with 'inputs' + // (operands) and 'outputs' (results). + OpBuilder builder(module); + + std::vector input_types = GetValueTypes(inputs); + std::vector output_types = GetValueTypes(outputs); + builder.setInsertionPointToEnd(&module.getBodyRegion().back()); + func::FuncOp func_op = builder.create( + module.getLoc(), name, + builder.getFunctionType(input_types, output_types)); + func_op.setPrivate(); + + return func_op; +} + +TF::StatefulPartitionedCallOp EncapsulateOpsInFunc( + OpBuilder& builder, const llvm::SetVector& ops, + const llvm::SetVector& inputs, const llvm::SetVector& outputs, + func::FuncOp parent_func, ModuleOp module, const std::string& name) { + // Moves all of the Operations in 'ops' into a newly created func.FuncOp + // function named 'name' and replaces the original ops with a call to the + // newly created function using a tf.StatefulPartitionedCall. Here, + // 'parent_func' is the function that holds the original set of ops. + // Note, 'inputs' and 'outputs' are the predetermined set of values that + // should become the operands and return values, respectively. + auto insertion_point = builder.saveInsertionPoint(); + func::FuncOp new_func = CreateFnWithSignature(module, inputs, outputs, + absl::StrCat("_func_", name)); + + // This preserves the order of the ops that was in the original parent + // funtion. This is critical for preserving correctness in the presence of + // resource variables and stateful functions. 
+ std::vector topological_order; + for (Operation& op : parent_func.getOps()) + if (ops.contains(&op)) topological_order.push_back(&op); + + // Create the partitioned call + builder.restoreInsertionPoint(insertion_point); + auto caller = MakeFuncCaller(builder, module.getLoc(), new_func, inputs); + + Block* block = new_func.addEntryBlock(); + + for (Operation* op : topological_order) op->moveBefore(block, block->end()); + + // Replace the 'inputs' values with the new function's arguments. + for (auto p : llvm::zip(inputs, new_func.getArguments())) + replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), + new_func.getBody()); + + builder.setInsertionPointToEnd(block); + builder.create(parent_func.getLoc(), outputs.getArrayRef()); + + // Replace the original 'outputs' values with the result of the call to the + // new function. + for (auto p : llvm::zip(outputs, caller->getResults())) + replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), + parent_func.getBody()); + + return caller; +} + +void UpdateAndInsertTPUOps(TF::StatefulPartitionedCallOp caller, + TF::TPUReplicateMetadataOp metadata_op, + TF::TPUCompilationResultOp compilation_op, + StringAttr old_group) { + // Adds the TPUReplicateMetatdataOp and TPUCompilationResultOp ops to the + // function called by the provided 'caller'. + mlir::CallInterfaceCallable callable = caller.getCallableForCallee(); + mlir::SymbolRefAttr sym = callable.dyn_cast(); + auto func = llvm::dyn_cast( + mlir::SymbolTable::lookupNearestSymbolFrom(caller, sym)); + OpBuilder builder(func.getBody()); + + StringAttr new_group = builder.getStringAttr( + absl::StrCat(old_group.getValue().str(), caller.getF().str())); + + builder.insert(metadata_op.clone()); + for (Operation& op : func.getOps()) { + if (!IsTPUOp(&op)) continue; + op.setAttr(TF::kReplicationInfoAttr, new_group); + } + TF::TPUCompilationResultOp new_result = compilation_op.clone(); + new_result->setAttr(kTpuCompilationStatus, new_group); + builder.insert(new_result); +} + +template +LogicalResult FindAndExcludeOp(func::FuncOp func, + const StringAttr& replication_attr, + llvm::SetVector& merged_set, + OpType& found_op) { + // Find the TPUReplicationMetadata or TPUCompilationResult ops which will be + // cloned/inserted into each region. We add them to the merged_set so that + // they're ignored when extracting the four main functions. + found_op = nullptr; + for (OpType op : func.getOps()) { + if (found_op != nullptr) { + func.emitOpError() << "number of " << found_op.getOperationName() + << " in loop body is not 1"; + return LogicalResult::failure(); + } + if (GetReplicationAttr(op) != replication_attr) { + op.emitOpError() << "is not part of the replication region " + << replication_attr << " vs " << GetReplicationAttr(op); + return LogicalResult::failure(); + } + found_op = op; + merged_set.insert(found_op); + } + return LogicalResult::success(); +} + +LogicalResult FindOwningWhileOp(func::FuncOp body_func, ModuleOp module, + TF::WhileOp* while_op) { + // Given a while loop body function 'body_func', find the tf.While Op that + // uses it. 
+ auto uses_optional = body_func.getSymbolUses(module); + if (!uses_optional.has_value()) { + body_func.emitOpError() << "no use of while loop body"; + return LogicalResult::failure(); + } + *while_op = nullptr; + for (auto& use : uses_optional.value()) { + if (llvm::isa(use.getUser())) { + if (*while_op != nullptr) { + use.getUser()->emitOpError() << "multiple users of function."; + return LogicalResult::failure(); + } else { + *while_op = llvm::cast(use.getUser()); + } + } else { + use.getUser()->emitOpError() << "non while use of function."; + return LogicalResult::failure(); + } + } + // TODO(bfontain): If the while op is not present we could just split things + // or we wait until the compiler supports multiple regions? + if (while_op == nullptr) { + body_func.emitOpError() << "unable to find while body user."; + return LogicalResult::failure(); + } + return LogicalResult::success(); +} + +LogicalResult FindForwardPassOps(OpBuilder& builder, + llvm::SetVector& forward_pass_ops, + llvm::SetVector& backward_pass_ops, + llvm::SetVector& merged_set, + func::FuncOp loop_body_func, + const int num_replicas) { + // Find all the ops that are to be included in the 'sc_forward' function which + // will be executed on the SparseCore. Note, 'forward_pass_ops' is initially + // seeded with ops from the input MLIR graph that have the + // _embedding_pipelining="forward" attribute which is set by the TF2 Embedding + // API. + // + // When outputs of the forward pass function are used outside of it, we'll + // need to insert a TPUReplicatedOutput Op and include that in the + // forward_pass_ops. And if that usage is also on the TPU (either TensorCore + // or SparseCore) we'll need to insert a matching TPUReplicatedInput. We do + // this before the Ops are removed from the original function/graph so that + // function operands and return values are handled automatically. + + // First, walk the op dependencies. + GatherOpsForExtraction(&forward_pass_ops, merged_set, /*predecessors=*/true, + /*successors=*/false); + + // Locate which variable inputs are part of the forwards pass. These will + // also be used in the backwards pass. We need to create a 'private' copy + // of the TpuReplicatedInput for for the fowards pass if there are users + // outside the pass. Note that in the case of the backwards pass existing + // this will be the case. + // This means that when we have put all out sections together some resource + // inputs will have multiple TPUReplicateInput nodes, so we will need a final + // pass to merge these together into the earliest copy. + llvm::SetVector forward_variable_inputs; + + // Validate that the only resource inputs that are read by ops in + // forward_pass_ops are dataset and variable ops. + int64_t resource_count = 0; + for (auto argument : loop_body_func.getArguments()) { + // Check that all resource arguments are either fed to iterator get next + // or a TPUReplicatedInput with is_packed. 
+ + if (IsResourceType(argument.getType())) { + resource_count++; + bool is_variable = false; + bool is_non_variable = false; + bool use_in_forward = false; + bool use_in_not_forward = false; + for (auto user : argument.getUsers()) { + if (llvm::isa(user)) continue; + if (!forward_pass_ops.contains(user)) { + use_in_not_forward = true; + } else { + use_in_forward = true; + } + if (TF::TPUReplicatedInputOp input = + llvm::dyn_cast(user)) { + if (!input.getIsPacked()) { + input.emitOpError() << "unexpected variable input, not packed"; + return LogicalResult::failure(); + } + + if (is_variable) { + input.emitOpError() << "unexpected multiple TPUReplicatedInputOp " + << "for single argument"; + return LogicalResult::failure(); + } + is_variable = true; + } else { + is_non_variable = true; + } + } + if (use_in_forward && use_in_not_forward) { + loop_body_func.emitOpError() + << "resource input " << argument.getArgNumber() + << " is used both in the forwards and " + << "not forward passes dataset"; + return LogicalResult::failure(); + } + if (is_non_variable && is_variable) { + loop_body_func.emitOpError() + << "resource input " << argument.getArgNumber() + << " is used both as a varible and not " + << " a variable"; + return LogicalResult::failure(); + } + if (is_variable && use_in_forward) + forward_variable_inputs.insert(argument.getArgNumber()); + } + } + + VLOG(3) << "Found " << forward_variable_inputs.size() + << " variables used in forward pass of " << resource_count + << " total resource inputs"; + + // Clone the TPUReplicatedInputs. + int64_t cloned_inputs = 0; + for (int64_t index : forward_variable_inputs) { + Value argument = loop_body_func.getArgument(index); + // Uses of this argument should only be the return and the + // TPUReplicateInputOp. This is checked by the loop above. + Operation* input_ptr = nullptr; + for (Operation* user : argument.getUsers()) { + if (llvm::isa(user)) { + input_ptr = user; + break; + } + } + TF::TPUReplicatedInputOp input = + llvm::cast(input_ptr); + + // Validate that all users of the TPUReplicatedInput are ReadVariable + // or AssignVariable ops and check if any are outside the forwards pass. + bool duplicate_needed = false; + for (Operation* next_user : input.getOutput().getUsers()) { + if (!llvm::isa(next_user) && + !llvm::isa(next_user)) { + next_user->emitOpError() + << "unexpected user of output of TPUReplicatedInputOp"; + return LogicalResult::failure(); + } + if (!forward_pass_ops.contains(next_user)) duplicate_needed = true; + } + if (!duplicate_needed) continue; + + cloned_inputs++; + builder.setInsertionPointAfter(input); + forward_pass_ops.remove(input); + + TF::TPUReplicatedInputOp private_input = input.clone(); + builder.insert(private_input); + forward_pass_ops.insert(private_input); + for (OpOperand& next_use : input.getOutput().getUses()) { + if (!forward_pass_ops.contains(next_use.getOwner())) continue; + next_use.getOwner()->setOperand(next_use.getOperandNumber(), + private_input.getOutput()); + } + } + + VLOG(2) << "Cloned " << cloned_inputs << " TPUReplicatedInputOps"; + + // Add TPUReplicatedInput/TPUReplicatedOutput pairs along each edge. + llvm::SetVector new_forward_ops; + for (Operation* op : forward_pass_ops) { + // TODO(bfontain): Should validate that all the TPU ops are in the same + // replication region. 
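Schematically, the replicated input/output pair that this loop inserts on each forward-pass result consumed by a TPU op outside the forward pass, for num_replicas = 2 (types are illustrative):

  // %act is produced inside the forward pass and used by a TPU op outside it.
  %out:2 = "tf.TPUReplicatedOutput"(%act) {device = ""} : (tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>)
  %in = "tf.TPUReplicatedInput"(%out#0, %out#1) {device = ""} : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32>
  // The out-of-region uses of %act are rewired to %in, so the replicated
  // output stays with the forward pass and the replicated input travels with
  // the consumer when the functions are later split apart.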
+ if (!IsTPUOp(op)) continue; + for (Value result : op->getResults()) { + std::vector> out_of_region_use; + for (OpOperand& use : result.getUses()) { + auto use_owner = use.getOwner(); + // TODO(bfontain): Error check here, if the use.getOwner() is not a TPU + // then this op must be a TPUReplicatedOutputOp. + if (IsTPUOp(use_owner) && !forward_pass_ops.contains(use_owner)) + out_of_region_use.push_back( + std::make_pair(use_owner, use.getOperandNumber())); + } + if (out_of_region_use.empty()) continue; + builder.setInsertionPointAfter(op); + std::vector types(num_replicas, result.getType()); + TF::TPUReplicatedOutputOp replicated_output = + builder.create(op->getLoc(), + TypeRange(types), result); + new_forward_ops.insert(replicated_output); + // TODO(bfontain): Check for other attributes. + replicated_output->setAttr(kDevice, builder.getStringAttr("")); + TF::TPUReplicatedInputOp input = builder.create( + op->getLoc(), result.getType(), replicated_output.getResults()); + input->setAttr(kDevice, builder.getStringAttr("")); + mlir::Value new_value = input.getOutput(); + + if (mlir::isa( + result.getDefiningOp())) { + TF::TPUAnnotateTensorsWithDynamicShapeOp annotate_op = + builder.create( + op->getLoc(), result.getType(), new_value, + result.getDefiningOp()->getAttrs()); + for (auto [operation, index] : out_of_region_use) { + if (!backward_pass_ops.contains(operation)) { + operation->emitOpError() + << "expect all dynamic inputs consumed by backwards pass."; + return LogicalResult::failure(); + } + } + + backward_pass_ops.insert(annotate_op); + new_value = annotate_op->getResult(0); + } + for (auto [operation, index] : out_of_region_use) + operation->setOperand(index, new_value); + } + } + + VLOG(2) << "inserted " << new_forward_ops.size() << " TPU Input/Output ops"; + forward_pass_ops.insert(new_forward_ops.begin(), new_forward_ops.end()); + return LogicalResult::success(); +} + +LogicalResult FindBackwardPassOps( + OpBuilder& builder, llvm::SetVector& backward_pass_ops, + llvm::SetVector& merged_set, const int num_replicas) { + // Find all the ops that are to be included in the 'sc_backward' function + // which will be executed on the SparseCore. Note, 'backward_pass_ops' is + // initially seeded with ops from the input MLIR graph that have the + // _embedding_pipelining="backward" attribute which is set by the TF2 + // Embedding API. + // + // Since we're inserting a replication boundary around the backward pass + // function, we'll also need to make sure TPUReplicatedInputOp and + // TPUReplicatedOutputOp ops are inserted as necessary. + + // First, walk the Ops dependencies. + GatherOpsForExtraction(&backward_pass_ops, merged_set, /*predecessors=*/false, + /*successors=*/true); + + VLOG(3) << "found " << backward_pass_ops.size() << " backwards pass ops"; + + // If any inputs are to the backward_pass_ops region are direct + // TPUReplicatedInput ops, then include (if this is the only use) or + // clone the op. This will be the case for all Read/Assign variable ops. + + llvm::SetVector to_clone; + llvm::SetVector to_insert; + + for (Operation* op : backward_pass_ops) { + for (OpOperand& input_value : op->getOpOperands()) { + Operation* predecessor_op = input_value.get().getDefiningOp(); + if (TF::TPUReplicatedInputOp input = + llvm::dyn_cast(predecessor_op)) { + if (to_clone.contains(input) || to_insert.contains(input)) continue; + // Check if all uses in backwards pass. 
+ bool all_in_backwards = true; + for (Operation* user : input->getUsers()) + if (!backward_pass_ops.contains(user)) all_in_backwards = false; + if (all_in_backwards) + to_insert.insert(input); + else + to_clone.insert(input); + } + } + } + backward_pass_ops.insert(to_insert.begin(), to_insert.end()); + for (TF::TPUReplicatedInputOp input : to_clone) { + builder.setInsertionPointAfter(input); + TF::TPUReplicatedInputOp private_input = input.clone(); + builder.insert(private_input); + backward_pass_ops.insert(private_input); + for (OpOperand& next_use : input.getOutput().getUses()) { + if (!backward_pass_ops.contains(next_use.getOwner())) continue; + next_use.getOwner()->setOperand(next_use.getOperandNumber(), + private_input.getOutput()); + } + } + + VLOG(2) << " cloned " << to_clone.size() << " and inserted " + << to_insert.size() << " TPUReplicatedInput ops"; + + // For all other inputs that go from TPU op to TPU op, insert the + // TPUOutput/Input pair. + + // Add TPUReplicatedInput/TPUReplicatedOutput pairs along each edge. + // TODO(bfontain): Should be merged with the above loop. + llvm::SetVector values_to_add_nodes; + + for (Operation* op : backward_pass_ops) { + // TODO(bfontain): Should validate that all the TPU ops are in the same + // replication region. + // If the op is already a replicated input, no need to to anything. + if (!IsTPUOp(op) || llvm::isa(op)) continue; + for (OpOperand& input_value : op->getOpOperands()) + // TODO(bfontain): Error check here, this line should never be false, + // since we skip the TF::TPUReplicatedInputOp case. + if (IsTPUOp(input_value.get().getDefiningOp()) && + !backward_pass_ops.contains(input_value.get().getDefiningOp())) + values_to_add_nodes.insert(input_value.get()); + } + + for (Value value : values_to_add_nodes) { + builder.setInsertionPointAfter(value.getDefiningOp()); + std::vector types(num_replicas, value.getType()); + Location loc = value.getDefiningOp()->getLoc(); + TF::TPUReplicatedOutputOp output = + builder.create(loc, TypeRange(types), value); + // TODO(bfontain): Check for other attributes. + output->setAttr(kDevice, builder.getStringAttr("")); + TF::TPUReplicatedInputOp input = builder.create( + loc, value.getType(), output.getResults()); + input->setAttr(kDevice, builder.getStringAttr("")); + for (OpOperand& use : value.getUses()) + if (backward_pass_ops.contains(use.getOwner())) + use.getOwner()->setOperand(use.getOperandNumber(), input.getOutput()); + backward_pass_ops.insert(input); + } + + VLOG(2) << " inserted " << values_to_add_nodes.size() + << " TPUReplicatedInput/Output pairs"; + return LogicalResult::success(); +} + +LogicalResult FindCoreTPUOps( + llvm::SetVector& core_tpu_ops, + const llvm::SetVector& forward_pass_ops, + const llvm::SetVector& backward_pass_ops, + const llvm::SetVector& merged_set, + func::FuncOp loop_body_func) { + // Find all of the Ops that are part of the forward/backward pass but aren't + // targeting the SparseCore. Note that we need to include some non-TPU ops + // that flow out of the forward pass function. Otherwise, they would get + // absorbed into the non_tpu function which breaks the pipelining + // decomposition strategy. + // + // Find all the outputs of the forward pass that aren't fed into the backward + // pass. 
+ for (Operation* op : forward_pass_ops) { + for (Value res : op->getResults()) { + for (auto user : res.getUsers()) { + if (!forward_pass_ops.contains(user) && + !backward_pass_ops.contains(user)) { + core_tpu_ops.insert(user); + } + } + } + } + + // Gather all TPU ops marked for compilation in this while loop body that also + // are not in one of the two other sets. + for (Operation& op : loop_body_func.getOps()) { + // Find all TPU ops that don't belong to the forward or backward pass. + if (merged_set.contains(&op) || llvm::isa(op) || + !IsTPUOp(&op) || op.hasAttr(kEmbeddingPipelining)) + continue; + // TODO(bfontain): only collect those ops in a fixed TPUReplica. + core_tpu_ops.insert(&op); + } + + GatherOpsForExtraction(&core_tpu_ops, merged_set, /*predecessors=*/true, + /*successors=*/true); + + // TODO(patn): Verify that all the ops here fall between the forward pass + // and backward pass ops (i.e., not before the forward pass or after the + // backward pass). + return LogicalResult::success(); +} + +LogicalResult FindNonTPUOps(llvm::SetVector& non_tpu_ops, + const llvm::SetVector& merged_set, + func::FuncOp loop_body_func) { + // Find all of the left over Ops after the sc_forward, sc_backward and + // core_tpu ops have been identified. What's left are just the ops necessary + // for updating loop counters etc. + llvm::SetVector non_tpu_args; + for (Operation& op : loop_body_func.getOps()) { + if (merged_set.contains(&op) || llvm::isa(op) || + op.hasAttr(kEmbeddingPipelining)) + continue; + // Note, there should be no TPU ops left at this point. If this trips, + // there's likely a bug in this pass. + if (IsTPUOp(&op)) { + loop_body_func.emitOpError() + << "Unexpcted TPU op found while identifying non-TPU ops."; + return LogicalResult::failure(); + } + non_tpu_ops.insert(&op); + } + + // Validate that remainder_ops takes and returns a subset of the loop carried + // args. This will basically be our set increment fn. + for (Operation* op : non_tpu_ops) + for (Value input : op->getOperands()) + if (BlockArgument arg = llvm::dyn_cast(input)) + // TODO(bfontain): Check that this is actually an argument to the loop + // body. + non_tpu_args.insert(arg.getArgNumber()); + + // All funcs have a return op so this should be safe. + func::ReturnOp return_op = *loop_body_func.getOps().begin(); + + for (OpOperand& operand : return_op->getOpOperands()) { + if (non_tpu_args.contains(operand.getOperandNumber())) { + if (BlockArgument argument = + llvm::dyn_cast(operand.get())) { + if (argument.getArgNumber() != operand.getOperandNumber()) { + return_op.emitOpError() + << "non TPU ops do not divide state into two pieces."; + return LogicalResult::failure(); + } + } else if (!non_tpu_ops.contains(operand.get().getDefiningOp())) { + return_op.emitOpError() + << "non TPU ops do not divide state into two pieces."; + return LogicalResult::failure(); + } + } + } + return LogicalResult::success(); +} + +LogicalResult ExtractOpsAsFunc( + OpBuilder& builder, ModuleOp module, llvm::SetVector& ops, + StringAttr replication_attr, TF::TPUReplicateMetadataOp metadata_op, + TF::TPUCompilationResultOp compilation_op, func::FuncOp parent_func, + const std::string& func_name, Operation** caller) { + // Move the given set of 'ops' into it's own function and replace them with a + // call to that function ('caller'). if 'metadata_op' and 'compilation_op' are + // non-null, also insert those (i.e., target the resulting function to the + // TPU). Here, 'parent_func' is the func.FuncOp that owns the ops in 'ops'. 
+ // + // Returns in 'caller' a tf.StatefulPartitionedCallOp that calls the function + // that was extracted.. + + // Find the input edges to form the set of operands to the new function call. + llvm::SetVector inputs; + for (Operation* op : ops) { + for (Value operand : op->getOperands()) { + Operation* defining_op = operand.getDefiningOp(); + if (!ops.contains(defining_op)) inputs.insert(operand); + } + } + // Find the output edges to form the set of resutls of the new function call. + llvm::SetVector results; + for (Operation* op : ops) { + for (auto result : op->getResults()) { + for (const OpOperand& operand : result.getUsers()) { + if (!ops.contains(operand.getOwner())) { + results.insert(result); + break; + } + } + } + } + llvm::SetVector outputs; + for (auto output : results) outputs.insert(output); + auto tf_caller = EncapsulateOpsInFunc(builder, ops, inputs, outputs, + parent_func, module, func_name); + if (!ops.empty() && metadata_op != nullptr && compilation_op != nullptr) + UpdateAndInsertTPUOps(tf_caller, metadata_op, compilation_op, + replication_attr); + *caller = tf_caller; + return LogicalResult::success(); +} + +void EmbeddingPipeliningPass::runOnOperation() { + ModuleOp module = getOperation(); + + llvm::SetVector forward_pass_ops; + llvm::SetVector backward_pass_ops; + + // Find all ops that we know compose the embedding forward and backward pass. + // These ops are only tagged if one enables the + // `pipeline_execution_with_tensor_core` flag in the mid-level API. + WalkResult walk_result = module.walk([&](Operation* op) -> WalkResult { + if (op->hasAttr(kEmbeddingPipelining)) { + const std::string region = + op->getAttrOfType(kEmbeddingPipelining).getValue().str(); + if (region == kEmbeddingForward) { + forward_pass_ops.insert(op); + } else if (region == kEmbeddingBackward) { + backward_pass_ops.insert(op); + } else { + return op->emitOpError() + << "embedding op has unknown " << kEmbeddingPipelining + << " attribute value " << region << "."; + } + op->removeAttr(kEmbeddingPipelining); + } + return WalkResult::advance(); + }); + if (walk_result.wasInterrupted()) return signalPassFailure(); + + // If there are no forward pass ops, there is no SC, so we end early. + if (forward_pass_ops.empty()) { + if (backward_pass_ops.empty()) { + return; + } else { + (*backward_pass_ops.begin())->emitOpError() + << "embedding backwards pass op with no forwards pass ops."; + return signalPassFailure(); + } + } + + // Ensure that all ops are in the same region, and have the same replication + // info. + // TODO(bfontain): Allow for multiple regions/loops in one module. + // TODO(patn): move this pass after cluster formation to remove the complexity + // with replication info and metadata, cluster checking and generalizing to + // multiple TPU clusters. 
+ Region* region = (*forward_pass_ops.begin())->getParentRegion(); + StringAttr replication_attr = GetReplicationAttr(*forward_pass_ops.begin()); + llvm::SmallVector checkset(forward_pass_ops.getArrayRef()); + checkset.append(backward_pass_ops.begin(), backward_pass_ops.end()); + for (Operation* op : checkset) { + if (op->getParentRegion() != region) { + op->emitOpError() << "embedding ops in two different regions"; + return signalPassFailure(); + } + if (GetReplicationAttr(op) != replication_attr) { + op->emitOpError() << "embedding ops with different replication info " + << replication_attr << " vs " << GetReplicationAttr(op); + return signalPassFailure(); + } + } + + // TODO(bfontain): Check that the region here is the region + // of the loop body func. + // Find the FuncOp for the surrounding while loop body. + func::FuncOp loop_body_func = + (*forward_pass_ops.begin())->getParentOfType(); + + // merged_set will keep track of which ops are to be avoided when gather ops + // for inclusion into the four extracted functions. + llvm::SetVector merged_set; + + // Find the TPUReplicationMetadata and TPUCompilationResult ops and delete + // them. These will be cloned/inserted into each region. + TF::TPUReplicateMetadataOp metadata_op; + auto result = FindAndExcludeOp(loop_body_func, replication_attr, merged_set, + metadata_op); + if (failed(result)) return signalPassFailure(); + const int num_replicas = metadata_op.getNumReplicas(); + + TF::TPUCompilationResultOp compilation_op; + result = FindAndExcludeOp( + loop_body_func, replication_attr, merged_set, compilation_op); + if (failed(result)) return signalPassFailure(); + + TF::WhileOp while_op = nullptr; + result = FindOwningWhileOp(loop_body_func, module, &while_op); + if (failed(result)) return signalPassFailure(); + + OpBuilder builder(module); + + result = FindForwardPassOps(builder, forward_pass_ops, backward_pass_ops, + merged_set, loop_body_func, num_replicas); + if (failed(result)) return signalPassFailure(); + merged_set.insert(forward_pass_ops.begin(), forward_pass_ops.end()); + + result = + FindBackwardPassOps(builder, backward_pass_ops, merged_set, num_replicas); + if (failed(result)) return signalPassFailure(); + merged_set.insert(backward_pass_ops.begin(), backward_pass_ops.end()); + + llvm::SetVector core_tpu_ops; + result = FindCoreTPUOps(core_tpu_ops, forward_pass_ops, backward_pass_ops, + merged_set, loop_body_func); + if (failed(result)) return signalPassFailure(); + merged_set.insert(core_tpu_ops.begin(), core_tpu_ops.end()); + + llvm::SetVector non_tpu_ops; + result = FindNonTPUOps(non_tpu_ops, merged_set, loop_body_func); + if (failed(result)) return signalPassFailure(); + merged_set.insert(non_tpu_ops.begin(), non_tpu_ops.end()); + + VLOG(2) << "Forwards pass " << forward_pass_ops.size() + << " ops, backwards pass " << backward_pass_ops.size() + << " ops, core " << core_tpu_ops.size() + << " ops. 
Total = " << merged_set.size() << " of " + << GetNumOps(loop_body_func) << ".\n"; + + builder.setInsertionPointAfter(*non_tpu_ops.begin()); + Operation* non_tpu_caller = nullptr; + result = + ExtractOpsAsFunc(builder, module, non_tpu_ops, replication_attr, nullptr, + nullptr, loop_body_func, "non_tpu", &non_tpu_caller); + if (failed(result)) return signalPassFailure(); + + builder.setInsertionPointAfter(non_tpu_caller); + Operation* forward_caller = nullptr; + result = ExtractOpsAsFunc(builder, module, forward_pass_ops, replication_attr, + metadata_op, compilation_op, loop_body_func, + "sc_forward", &forward_caller); + if (failed(result)) return signalPassFailure(); + + // Create tpu_core function + builder.setInsertionPointAfter(forward_caller); + Operation* core_tpu_caller = nullptr; + result = ExtractOpsAsFunc(builder, module, core_tpu_ops, replication_attr, + metadata_op, compilation_op, loop_body_func, + "core_tpu", &core_tpu_caller); + if (failed(result)) return signalPassFailure(); + + builder.setInsertionPointAfter(core_tpu_caller); + Operation* backwards_pass_caller = nullptr; + result = ExtractOpsAsFunc( + builder, module, backward_pass_ops, replication_attr, metadata_op, + compilation_op, loop_body_func, "sc_backward", &backwards_pass_caller); + if (failed(result)) return signalPassFailure(); + + metadata_op->erase(); + compilation_op->erase(); +} + +} // namespace + +std::unique_ptr> CreateEmbeddingPipeliningPass() { + return std::make_unique(); +} + +} // namespace TFDevice +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc index 1d7b2c10ba6..9c3e82e88e1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc @@ -169,13 +169,14 @@ Operation* CreateSendFromHostOp(OpBuilder& builder, Location loc, ValueRange inputs, Value compilation_key, Value device_ordinal, int default_device_ordinal, + StringAttr device_type_attr, llvm::StringRef communication_key) { if (device_ordinal) return ApplyXlaHostTransferAttr( builder.create( loc, inputs, /*dynamic_key=*/compilation_key, device_ordinal, - builder.getStringAttr(communication_key)), + builder.getStringAttr(communication_key), device_type_attr), builder); return ApplyXlaHostTransferAttr( @@ -183,7 +184,8 @@ Operation* CreateSendFromHostOp(OpBuilder& builder, Location loc, loc, inputs, /*dynamic_key=*/compilation_key, builder.getStringAttr(communication_key), - /*device_ordinal=*/builder.getI64IntegerAttr(default_device_ordinal)), + /*device_ordinal=*/builder.getI64IntegerAttr(default_device_ordinal), + device_type_attr), builder); } @@ -192,19 +194,21 @@ Operation* CreateSendFromHostOp(OpBuilder& builder, Location loc, Operation* CreateRecvAtHostOp(OpBuilder& builder, Location loc, TypeRange output_types, Value compilation_key, Value device_ordinal, int default_device_ordinal, + StringAttr device_type_attr, llvm::StringRef communication_key) { if (device_ordinal) return ApplyXlaHostTransferAttr( builder.create( loc, output_types, /*dynamic_key=*/compilation_key, device_ordinal, - builder.getStringAttr(communication_key)), + builder.getStringAttr(communication_key), device_type_attr), builder); return ApplyXlaHostTransferAttr( builder.create( loc, output_types, /*dynamic_key=*/compilation_key, builder.getStringAttr(communication_key), - 
/*device_ordinal=*/builder.getI64IntegerAttr(default_device_ordinal)), + /*device_ordinal=*/builder.getI64IntegerAttr(default_device_ordinal), + device_type_attr), builder); } @@ -332,14 +336,6 @@ bool HasDynamicExternalValues(Operation* op) { .wasInterrupted(); } -// Checks if `type` is allowed for XLA. String and resources are not XLA types. -// There are other TF types that are not XLA types which will be removed by -// successive passes in TF/XLA bridge phase 2. -bool TypeValidForXLA(const Type& type) { - const Type elem = getElementTypeOrSelf(type); - return !elem.isa() && !elem.isa(); -} - // Returns operands of `cluster_ops` that need to be // communicated from device->host. This is for the case when all operands have a // static shape. @@ -354,7 +350,7 @@ llvm::SmallSetVector GetStaticExternalOperands( walked_op)) return WalkResult::advance(); for (Value v : walked_op->getOperands()) { - if (!TypeValidForXLA(v.getType())) continue; + if (!tensorflow::TypeValidForXLA(v.getType())) continue; if (auto* defining_op = v.getDefiningOp()) { if (!op->isAncestor(defining_op) && device_cluster->isAncestor(defining_op) && @@ -385,7 +381,7 @@ llvm::SmallSetVector GetAllExternalOperands( for (Operation* op : cluster_ops) { op->walk([&](Operation* walked_op) { for (Value v : walked_op->getOperands()) { - if (!TypeValidForXLA(v.getType())) continue; + if (!tensorflow::TypeValidForXLA(v.getType())) continue; Operation* defining_op = v.getDefiningOp(); if (!defining_op || !cluster_ops.count(defining_op)) { external_values.insert(v); @@ -431,8 +427,8 @@ void GetExternalOutputs(const llvm::SmallSetVector& cluster_ops, HasDynamicOutputs(user)) { if (!user_set.insert(user).second) continue; for (Value v : user->getOperands()) { - if (TypeValidForXLA(v.getType()) && v.getDefiningOp() == op && - !isa(user)) + if (tensorflow::TypeValidForXLA(v.getType()) && + v.getDefiningOp() == op && !isa(user)) external_outputs.insert(v); if (v.getDefiningOp() == op && isa(user)) tmp_host_outputs.push_back(v); @@ -489,7 +485,7 @@ bool ShouldCloseCluster(llvm::ArrayRef outputs) { return true; } } - if (!TypeValidForXLA(v.getType())) + if (!tensorflow::TypeValidForXLA(v.getType())) for (const Operation* user : v.getUsers()) if (!isa(user)) has_nonxla_output = true; } @@ -570,8 +566,8 @@ void MoveOpsToHost(const llvm::SmallSetVector& clustered_ops, const llvm::SmallSetVector& external_operands, const llvm::SmallSetVector& external_outputs, Operation* insertion_point, Value compilation_key, - Value device_ordinal, int default_device_ordignal, - int& communication_key_index) { + Value device_ordinal, int default_device_ordinal, + StringAttr device_type_attr, int& communication_key_index) { OpBuilder builder(insertion_point); Operation& op = *clustered_ops.back(); std::string args_communication_key = @@ -612,7 +608,7 @@ void MoveOpsToHost(const llvm::SmallSetVector& clustered_ops, Operation* recv_at_host = CreateRecvAtHostOp( builder, op.getLoc(), host_operand_types, compilation_key, device_ordinal, - default_device_ordignal, args_communication_key); + default_device_ordinal, device_type_attr, args_communication_key); Block* original_op_block = op.getBlock(); Operation* after_op = recv_at_host; for (Operation* cluster_op : clustered_ops) { @@ -624,7 +620,8 @@ void MoveOpsToHost(const llvm::SmallSetVector& clustered_ops, if (!external_outputs.empty()) { CreateSendFromHostOp(builder, op.getLoc(), external_outputs.getArrayRef(), compilation_key, device_ordinal, - default_device_ordignal, retvals_communication_key); + 
default_device_ordinal, device_type_attr, + retvals_communication_key); } if (external_operands.empty()) { @@ -656,7 +653,7 @@ void MoveOpsToHost(const llvm::SmallSetVector& clustered_ops, // its value. LogicalResult MoveOpsToHost( tf_device::ClusterOp device_cluster, Block* src, Operation* insertion_point, - Value compilation_key, Value device_ordinal, int default_device_ordignal, + Value compilation_key, Value device_ordinal, int default_device_ordinal, int& communication_key_index, llvm::SmallVector* return_value_from_host = nullptr) { // Contains all of the outside compiled operations that should be moved to the @@ -664,6 +661,8 @@ LogicalResult MoveOpsToHost( // single op except in the case where some of the input/output shapes are // non-static. llvm::SmallSetVector clustered_ops; + auto device_type_attr = + device_cluster->getAttrOfType(TF::kCompileDeviceTypeAttr); for (Operation& op : llvm::make_early_inc_range(*src)) { if (HasOutsideCompilationAncestorExclusive(&op) || @@ -687,7 +686,8 @@ LogicalResult MoveOpsToHost( } MoveOpsToHost(clustered_ops, external_operands, external_outputs, insertion_point, compilation_key, device_ordinal, - default_device_ordignal, communication_key_index); + default_device_ordinal, device_type_attr, + communication_key_index); clustered_ops.clear(); } @@ -710,7 +710,8 @@ LogicalResult MoveOpsToHost( MoveOpsToHost(clustered_ops, external_operands, external_outputs, insertion_point, compilation_key, device_ordinal, - default_device_ordignal, communication_key_index); + default_device_ordinal, device_type_attr, + communication_key_index); clustered_ops.clear(); } } @@ -740,24 +741,22 @@ void GetReturnValueFromDevice( // `communication_key_index` when creating communication ops. LogicalResult DecomposeControlFlow(tf_device::ClusterOp device_cluster, Value compilation_key, Value device_ordinal, - int default_device_ordignal, + int default_device_ordinal, int& communication_key_index) { auto result = device_cluster.GetBody().walk([&](Operation* op) { if (auto if_op = llvm::dyn_cast(op)) { if (!HasOutsideCompilationNested(op)) return WalkResult::advance(); OpBuilder builder(if_op); auto host_if = CloneEmptyIfWithPredicate(if_op, builder); - if (failed(MoveOpsToHost(device_cluster, &if_op.getThenBranch().front(), - host_if.getThenBranch().front().getTerminator(), - compilation_key, device_ordinal, - default_device_ordignal, - communication_key_index))) + if (failed(MoveOpsToHost( + device_cluster, &if_op.getThenBranch().front(), + host_if.getThenBranch().front().getTerminator(), compilation_key, + device_ordinal, default_device_ordinal, communication_key_index))) return WalkResult::interrupt(); - if (failed(MoveOpsToHost(device_cluster, &if_op.getElseBranch().front(), - host_if.getElseBranch().front().getTerminator(), - compilation_key, device_ordinal, - default_device_ordignal, - communication_key_index))) + if (failed(MoveOpsToHost( + device_cluster, &if_op.getElseBranch().front(), + host_if.getElseBranch().front().getTerminator(), compilation_key, + device_ordinal, default_device_ordinal, communication_key_index))) return WalkResult::interrupt(); // Mark op as stateful due to side-effecting communication ops. 
if_op->setAttr("is_stateless", builder.getBoolAttr(false)); @@ -782,21 +781,21 @@ LogicalResult DecomposeControlFlow(tf_device::ClusterOp device_cluster, builder.setInsertionPointToEnd(&cond.front()); auto recv_condition_at_host = CreateRecvAtHostOp( builder, while_op.getLoc(), TypeRange{condition.getType()}, - compilation_key, device_ordinal, default_device_ordignal, + compilation_key, device_ordinal, default_device_ordinal, + device_cluster->getAttrOfType(TF::kCompileDeviceTypeAttr), condition_send_recv_key); builder.create(while_op.getLoc(), recv_condition_at_host->getResults()); if (failed(MoveOpsToHost(device_cluster, &while_op.getCond().front(), recv_condition_at_host, compilation_key, - device_ordinal, default_device_ordignal, + device_ordinal, default_device_ordinal, communication_key_index))) return WalkResult::interrupt(); - if (failed(MoveOpsToHost(device_cluster, &while_op.getBody().front(), - host_while.getBody().front().getTerminator(), - compilation_key, device_ordinal, - default_device_ordignal, - communication_key_index))) + if (failed(MoveOpsToHost( + device_cluster, &while_op.getBody().front(), + host_while.getBody().front().getTerminator(), compilation_key, + device_ordinal, default_device_ordinal, communication_key_index))) return WalkResult::interrupt(); // Mark op as stateful due to side-effecting communication ops. while_op->setAttr("is_stateless", builder.getBoolAttr(false)); @@ -1167,7 +1166,7 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( // have a valid XLA type. LogicalResult CheckClusterResults(tf_device::ClusterOp cluster) { for (OpResult result : cluster.getResults()) { - if (!TypeValidForXLA(result.getType())) { + if (!tensorflow::TypeValidForXLA(result.getType())) { cluster.emitError() << "The ExtractHeadTailOutsideCompilation pass produced a Device " "cluster with a result with a non-XLA type: " diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc new file mode 100644 index 00000000000..9284fd2bc0b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc @@ -0,0 +1,199 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "llvm/Support/Casting.h" +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" + +#define DEBUG_TYPE "tf-extract-tpu-copy-with-dynamic-shape-op" + +namespace mlir { +namespace TFTPU { + +namespace { + +#define GEN_PASS_DEF_EXTRACTTPUCOPYWITHDYNAMICSHAPEOPPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" + +class ExtractTPUCopyWithDynamicShapeOpPass + : public impl::ExtractTPUCopyWithDynamicShapeOpPassBase< + ExtractTPUCopyWithDynamicShapeOpPass> { + void runOnOperation() override; +}; + +// Finds op that created a given value. If the value is a BlockArgument, this +// returns the owner of the Block. +Operation* GetOpOfValue(Value value) { + if (auto block_arg = value.dyn_cast()) + return block_arg.getOwner()->getParentOp(); + + return value.getDefiningOp(); +} + +// Check if the TPUCopyWithDynamicShapeOp is valid. +// 1. The op should be wrapped inside a launch op. +// 2. The wrapped launch op should be placed on CPU. +bool IsOpValid(Operation* op) { + auto launch_op = llvm::dyn_cast(op->getParentOp()); + if (!launch_op) return false; + std::string device_str = launch_op.getDeviceAttr().getValue().str(); + return device_str == tensorflow::GetDeviceAliasForHostOfLogicalCore(0) || + device_str == "/job:localhost/replica:0/task:0/device:CPU:0"; +} + +// Get the new launch op results. This is the results if the copy op is removed +// from the old launch op. +llvm::SmallVector CreateNewLaunchOpResults( + tf_device::LaunchOp* old_launch_op, + Operation* tpu_copy_with_dynamic_shape_op) { + llvm::SmallSetVector new_launch_op_results; + + new_launch_op_results.insert( + old_launch_op->GetBody().getTerminator()->getOperands().begin(), + old_launch_op->GetBody().getTerminator()->getOperands().end()); + + for (Value operand : tpu_copy_with_dynamic_shape_op->getOperands()) { + if (GetOpOfValue(operand)->getParentRegion() == + tpu_copy_with_dynamic_shape_op->getParentRegion()) { + new_launch_op_results.insert(operand); + } + } + + for (Value result : tpu_copy_with_dynamic_shape_op->getResults()) { + new_launch_op_results.remove(result); + } + + return new_launch_op_results.takeVector(); +} + +// Create a new host launch op which contains all the old launch op body +// except the dynamic shape copy op. +tf_device::LaunchOp CreateNewHostLaunchOpWithNewResult( + tf_device::LaunchOp* old_launch_op, + llvm::SmallVector& new_launch_op_results) { + OpBuilder builder(*old_launch_op); + + builder.setInsertionPointAfter(*old_launch_op); + + llvm::SmallVector new_launch_op_results_types; + for (Value result : new_launch_op_results) + new_launch_op_results_types.push_back(result.getType()); + + auto new_launch_op = builder.create( + old_launch_op->getLoc(), old_launch_op->getDeviceAttr(), + /*result_types=*/new_launch_op_results_types); + + new_launch_op.getBody().takeBody(old_launch_op->getBody()); + new_launch_op.GetBody().getTerminator()->setOperands(new_launch_op_results); + + return new_launch_op; +} + +// Create the new device launch op which wraps the copy op. 
+tf_device::LaunchOp CreateNewDeviceLaunchOp( + Operation* tpu_copy_with_dynamic_shape_op, bool replicated) { + OpBuilder builder(tpu_copy_with_dynamic_shape_op); + + builder.setInsertionPointAfter(tpu_copy_with_dynamic_shape_op); + + std::string device_str; + if (replicated) { + device_str = tensorflow::GetDeviceAliasForLogicalCore(0); + } else { + device_str = "/job:localhost/replica:0/task:0/device:TPU:0"; + } + + auto new_device_launch_op = builder.create( + tpu_copy_with_dynamic_shape_op->getLoc(), + builder.getStringAttr(device_str), + /*result_types=*/tpu_copy_with_dynamic_shape_op->getResultTypes()); + + new_device_launch_op.getBody().push_back(new Block); + builder.setInsertionPointToEnd(&new_device_launch_op.GetBody()); + auto* return_op = builder + .create( + tpu_copy_with_dynamic_shape_op->getLoc(), + tpu_copy_with_dynamic_shape_op->getResults()) + .getOperation(); + tpu_copy_with_dynamic_shape_op->moveBefore(return_op); + return new_device_launch_op; +} + +// Update all the usage of tf_device.return op with launch op result. +void UpdateReturnOpResultWithLaunchOpResult(tf_device::LaunchOp* launch_op) { + auto operand_not_in_launch = [&](OpOperand& operand) { + return !launch_op->getOperation()->isProperAncestor(operand.getOwner()); + }; + + for (auto result : + llvm::zip(launch_op->getResults(), + launch_op->GetBody().getTerminator()->getOperands())) + std::get<1>(result).replaceUsesWithIf(std::get<0>(result), + operand_not_in_launch); +} + +void ExtractTPUCopyWithDynamicShapeOpPass::runOnOperation() { + llvm::SmallVector tpu_copy_with_dynamic_shape_ops; + getOperation().walk([&](Operation* op) { + if (isa(op)) { + if (!IsOpValid(op)) return signalPassFailure(); + tpu_copy_with_dynamic_shape_ops.push_back(op); + } + }); + + for (Operation* op : tpu_copy_with_dynamic_shape_ops) { + OpBuilder builder(op); + + auto old_launch_op = llvm::dyn_cast(op->getParentOp()); + + bool replicated = old_launch_op.getDeviceAttr().getValue().str() == + tensorflow::GetDeviceAliasForHostOfLogicalCore(0); + + for (auto result : + llvm::zip(old_launch_op->getResults(), + old_launch_op.GetBody().getTerminator()->getOperands())) + std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); + + llvm::SmallVector new_launch_op_results = + CreateNewLaunchOpResults(&old_launch_op, op); + + op->moveAfter(old_launch_op); + + auto new_host_launch_op = CreateNewHostLaunchOpWithNewResult( + &old_launch_op, new_launch_op_results); + UpdateReturnOpResultWithLaunchOpResult(&new_host_launch_op); + + old_launch_op->erase(); + + auto new_device_launch_op = CreateNewDeviceLaunchOp(op, replicated); + UpdateReturnOpResultWithLaunchOpResult(&new_device_launch_op); + } +} + +} // namespace + +std::unique_ptr> +CreateExtractTPUCopyWithDynamicShapeOpPass() { + return std::make_unique(); +} +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc index b5af8f60bd4..dff2223b115 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc @@ -15,6 +15,8 @@ limitations under the License. 
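The new extract_tpu_copy_with_dynamic_shape_op.cc pass above splits one host launch into a host launch plus a device launch around the copy op. The core bookkeeping is `CreateNewLaunchOpResults`: start from the old terminator operands, add copy-op operands that are defined inside the same region, and drop the copy op's own results. A minimal standalone model of that set logic, with integers standing in for MLIR values (names mirror the helper but are illustrative only):

```c++
#include <algorithm>
#include <iostream>
#include <vector>

// Values are modeled as integers; "defined in the launch" is a membership test.
std::vector<int> CreateNewLaunchResults(const std::vector<int>& old_results,
                                        const std::vector<int>& copy_operands,
                                        const std::vector<int>& copy_results,
                                        const std::vector<int>& defined_in_launch) {
  std::vector<int> results = old_results;
  auto contains = [](const std::vector<int>& v, int x) {
    return std::find(v.begin(), v.end(), x) != v.end();
  };
  // Operands of the copy op produced inside the launch must now be returned by
  // the host launch so the new device launch can consume them.
  for (int operand : copy_operands) {
    if (contains(defined_in_launch, operand) && !contains(results, operand))
      results.push_back(operand);
  }
  // Results produced by the copy op are no longer produced by the host launch;
  // the device launch that wraps the copy op returns them instead.
  results.erase(std::remove_if(results.begin(), results.end(),
                               [&](int r) { return contains(copy_results, r); }),
                results.end());
  return results;
}

int main() {
  // Old host launch returned {10, 11}; the copy op consumed 5 (defined in the
  // launch) and produced 11.
  for (int r : CreateNewLaunchResults({10, 11}, {5}, {11}, {5, 10}))
    std::cout << r << " ";  // prints "10 5"
  std::cout << "\n";
}
```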
#include "tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h" +#include + #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project @@ -33,7 +35,8 @@ using Graph = ::tensorflow::Graph; } // namespace Status MlirGraphOptimizationPass::Run( - const ConfigProto& config_proto, ModuleOp module, const Graph& graph, + const std::string& function_name, const ConfigProto& config_proto, + ModuleOp module, const Graph& graph, const tensorflow::FunctionLibraryDefinition& function_library) { if (GetPassState(/*device_set=*/nullptr, config_proto, graph, function_library) == diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h index 4da3e14721b..4390e59ca80 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GRAPH_OPTIMIZATION_PASS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GRAPH_OPTIMIZATION_PASS_H_ +#include + #include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" namespace mlir { @@ -39,6 +41,7 @@ class MlirGraphOptimizationPass : public ::tensorflow::MlirOptimizationPass { } ::tensorflow::Status Run( + const std::string& function_name, const ::tensorflow::ConfigProto& config_proto, ModuleOp module, const ::tensorflow::Graph& graph, const tensorflow::FunctionLibraryDefinition& function_library) override; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.cc b/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.cc index 805fb19742a..2edd6d76f03 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.cc @@ -164,7 +164,7 @@ void wrapOpsInFunction(std::vector& ops, int function_id, auto call = builder.create( ops[0]->getLoc(), func.getFunctionType().getResults(), func.getSymName(), inputs); - for (auto& v : llvm::enumerate(outputs)) { + for (const auto& v : llvm::enumerate(outputs)) { v.value().replaceUsesWithIf(call.getResult(v.index()), [=](OpOperand& o) { // Outside of what we're moving, results of our operations need to // be replaced by results from the function call. 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.cc b/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.cc index 44e178ac76c..3b974c39570 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.cc @@ -114,8 +114,8 @@ LogicalResult InitializeVariablesInSessionInitializer( const tensorflow::DeviceMgr* mgr = nullptr; auto status = session->LocalDeviceManager(&mgr); if (!status.ok()) { - module->emitError("failed to fetch device manager: " + - status.error_message()); + module->emitError( + absl::StrCat("failed to fetch device manager: ", status.message())); return failure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc index aa2605cbb33..bc0534fdb0b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc @@ -82,7 +82,7 @@ LogicalResult LiftVariablesFromSession( /*target_tensor_names=*/{}, &resource_tensors); if (!status.ok()) { return module.emitOpError() - << "failed to run the provided session: " << status.error_message(); + << "failed to run the provided session: " << status.message(); } const DeviceMgr* device_manager; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_initialized_variables.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mark_initialized_variables.cc index 25c9e4c0749..54fef16b043 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mark_initialized_variables.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/mark_initialized_variables.cc @@ -56,8 +56,8 @@ LogicalResult MarkInitializedVariablesInFunction(func::FuncOp function, const tensorflow::DeviceMgr* mgr = nullptr; auto status = session->LocalDeviceManager(&mgr); if (!status.ok()) - return function->emitError("failed to fetch device manager: " + - status.error_message()); + return function->emitError( + absl::StrCat("failed to fetch device manager: ", status.message())); // Fetch all varHandleOp in the function. 
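Several hunks above replace `status.error_message()` with `status.message()` and build the diagnostic with `absl::StrCat`. The newer accessor returns a string view rather than an owned `std::string`, so plain `operator+` with a literal no longer applies and concatenation has to go through something view-aware. A std-only sketch with a stand-in Status type (the real classes live in tsl/absl):

```c++
#include <iostream>
#include <string>
#include <string_view>

// Stand-in for a Status whose message() returns a view, as in the newer API.
class Status {
 public:
  explicit Status(std::string msg) : msg_(std::move(msg)) {}
  std::string_view message() const { return msg_; }

 private:
  std::string msg_;
};

// View-aware concatenation, playing the role absl::StrCat plays in the diff.
std::string StrCat(std::string_view a, std::string_view b) {
  std::string out;
  out.reserve(a.size() + b.size());
  out.append(a);
  out.append(b);
  return out;
}

int main() {
  Status status("no devices registered");
  // "failed to fetch device manager: " + status.message() would not compile:
  // there is no operator+ for (const char*, std::string_view).
  std::cout << StrCat("failed to fetch device manager: ", status.message())
            << "\n";
}
```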
llvm::SmallVector var_ops; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/merge_control_flow.cc b/tensorflow/compiler/mlir/tensorflow/transforms/merge_control_flow.cc index af856295f24..709e4532c12 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/merge_control_flow.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/merge_control_flow.cc @@ -367,7 +367,8 @@ llvm::SmallVector GetReturnIndicesToKeep( } return false; }; - for (auto& index_and_value : llvm::enumerate(current_if_op.getResults())) { + for (const auto& index_and_value : + llvm::enumerate(current_if_op.getResults())) { if (!llvm::all_of(index_and_value.value().getUsers(), is_op_inside_IfRegions)) { return_indices_to_keep.push_back(index_and_value.index()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.cc index 8594e5ad65a..32ff28ea968 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/mlprogram.cc @@ -64,7 +64,7 @@ void PopulateLowerToMlProgramAndHloPipeline(mlir::OpPassManager& pm) { pm.addPass(mlir::TF::CreateTFShapeInferencePass()); llvm::StringRef tf2xla_fallback_device_type = "XLA_CPU_JIT"; - pm.addNestedPass(mlir::mhlo::createLegalizeTFPass( + pm.addPass(mlir::mhlo::createLegalizeTFPass( /*allow_partial_conversion=*/true, /*legalize_chlo=*/true, tf2xla_fallback_device_type, /*prefer_tf2xla=*/false)); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td index 034fbe1d840..be01d276902 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td @@ -43,7 +43,7 @@ def CanFuseMulAndConv2D : Constraint>; def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; + CPred<"$_self.cast().getShapedType().getElementType().isF32()">, "float constant tensor">; def DefinedByConv2D : Constraint($0.getDefiningOp())">>; // Checks if the value has only one user. def HasOneUse : Constraint>; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index b17084201a4..4092b90411e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -377,10 +377,12 @@ std::unique_ptr> CreateClusterConstantSinkingPass( llvm::function_ref filter = {}); // Creates a pass that outlines regions of tf_device.cluster operations. -std::unique_ptr> CreateClusterOutliningPass(); +std::unique_ptr> CreateClusterOutliningPass( + bool globally_unique_func_names = true); // Creates a pass that outlines regions of tf_device.launch operations. -std::unique_ptr> CreateLaunchOutliningPass(); +std::unique_ptr> CreateLaunchOutliningPass( + bool globally_unique_func_names = true); // Creates a pass that converts tf_device::LaunchFuncOp into // TF::PartitionedCallOp. @@ -432,6 +434,10 @@ std::unique_ptr> CreateReplicateToIslandPass( std::unique_ptr> CreateReplicaIDToDeviceOrdinalPass(); +// Creates a pass that adds pipelining to a graph that contains device +// accelerated embeddings. +std::unique_ptr> CreateEmbeddingPipeliningPass(); + // Creates a pass that creates `tf_executor.island` from a single // `tf_device.parallel_execute` island. 
std::unique_ptr> CreateParallelExecuteToIslandsPass( @@ -529,9 +535,16 @@ CreateTPUReorderReplicateAndPartitionedInputsPass(); std::unique_ptr> CreateTPUResourceReadsWritesPartitioningPass(); +// Creates a pass that looks for usage of the result of +// TPUCopyWithDynamicShapeOp and annotate these values to be dynamic shape. This +// ensures that the generated tpu program has the correct inputs annotation. +std::unique_ptr> +CreateTPUAnnotateDynamicShapeInputsPass(); + // Creates a pass that rewrites `tf_device.launch_func` on TPUs into TPU runtime // ops. -std::unique_ptr> CreateTPURewritePass(); +std::unique_ptr> CreateTPURewritePass( + llvm::StringRef module_name = llvm::StringRef()); // Creates a pass that identifies XLASharding ops in launch op for TPU // computation. @@ -549,6 +562,12 @@ CreateTPUParallelExecuteSinkResourceWritePass(); std::unique_ptr> CreateTPUMergeVariablesWithExecutePass(); +// Create a pass that extract TPUCopyWithDynamicShapeOp from the host launch op +// and wrap them in device launch op. This allows this op executed on TPU while +// still compiled on host. +std::unique_ptr> +CreateExtractTPUCopyWithDynamicShapeOpPass(); + // Creates a pass that wraps ReadVariableOp/AssignVariable op that consumes a // packed tensor to have same device placement as underlying TPU device. std::unique_ptr> @@ -578,9 +597,13 @@ CreateTPUUpdateEmbeddingEnqueueOpInputsPass(); // Creates a pass that propagates TPU devices to users. std::unique_ptr> CreateTPUDevicePropagationPass(); +// Create a pass that colocates each `Split` with its predecessor. +std::unique_ptr> CreateTPUColocateSplitsPass(); + // Populates the supplied passmanager with the passes required to run the // bridge. -void CreateTPUBridgePipeline(OpPassManager& pm); +void CreateTPUBridgePipeline(OpPassManager& pm, + llvm::StringRef module_name = llvm::StringRef()); // Populates the supplied passmanager with the passes required to run the // bridge in V1 mode. @@ -681,6 +704,7 @@ enum MoveTransposeDirection { kBegin, kEnd }; #define GEN_PASS_DECL_TPUHOSTCOMPUTATIONEXPANSIONPASS #define GEN_PASS_DECL_TPUIDENTITYPRUNINGPASS #define GEN_PASS_DECL_TPUMERGEVARIABLESWITHEXECUTEPASS +#define GEN_PASS_DECL_EXTRACTTPUCOPYWITHDYNAMICSHAPEOPPASS #define GEN_PASS_DECL_TPUPARALLELEXECUTESINKRESOURCEWRITEPASS #define GEN_PASS_DECL_TPUREORDERREPLICATEANDPARTITIONEDINPUTSPASS #define GEN_PASS_DECL_TPURESOURCEREADFORWRITEPASS diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc index 457dca838af..81affc412bc 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc @@ -67,6 +67,12 @@ class RewriteXlaHostComputeMlir LogicalResult matchAndRewrite(TF::_XlaHostComputeMlirOp op, PatternRewriter& rewriter) const override { + if (op.getManualSharding()) { + op.emitOpError() << "manual_sharding not supported with fallback of " + "phase 2 legalize TF/XLA bridge. 
manual_sharding is " + "used by map_outside_compilation"; + return failure(); + } llvm::SmallVector shape_attrs; shape_attrs.reserve(op.getNumResults()); for (Type ty : op.getResultTypes()) { @@ -99,7 +105,8 @@ class RewriteXlaHostComputeMlir auto recv_at_host = rewriter.create( func.getLoc(), op.getOperandTypes(), /*dynamic_key=*/dynamic_key, op.getSendKeyAttr(), - /*device_ordinal=*/rewriter.getI64IntegerAttr(0)); + /*device_ordinal=*/rewriter.getI64IntegerAttr(0), + rewriter.getStringAttr("TPU")); for (auto result : llvm::zip(cloned_func.getArguments(), recv_at_host->getResults())) { std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); @@ -110,7 +117,8 @@ class RewriteXlaHostComputeMlir func.getLoc(), cloned_func.getBody().front().getTerminator()->getOperands(), /*dynamic_key=*/dynamic_key, op.getRecvKeyAttr(), - /*device_ordinal=*/rewriter.getI64IntegerAttr(0)); + /*device_ordinal=*/rewriter.getI64IntegerAttr(0), + rewriter.getStringAttr("TPU")); } constexpr int64_t kDefaultCostEstimate = 1000000; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc index 1831ecc68db..49b913963af 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc @@ -17,6 +17,7 @@ limitations under the License. // result(s) regardless of replication, out of their respective replicate. #include +#include #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Casting.h" @@ -45,8 +46,46 @@ struct ReplicateInvariantOpHoistingPass void runOnOperation() override; }; +// Check if op directly uses a key in `virtual_devices`. +bool DirectUseOfVirtualDevice(const DictionaryAttr& virtual_devices, + Operation* op) { + StringAttr op_device = op->getAttrOfType(kDeviceAttr); + if (!op_device) return false; + if (virtual_devices.get(op_device.getValue())) return true; + return false; +} + +// Check if op or its ancestor uses a key in `virtual_devices`. +bool AncestorUsesVirtualDevice( + const std::optional& virtual_devices, Operation* op) { + if (!virtual_devices.has_value()) return false; + if (!op) return false; + if (llvm::isa(op)) return false; + if (DirectUseOfVirtualDevice(*virtual_devices, op)) return true; + return AncestorUsesVirtualDevice(virtual_devices, op->getParentOp()); +} + +// Check if op or its descendant uses a key in `virtual_devices`. +bool DescendantUsesVirtualDevice( + const std::optional& virtual_devices, + Operation* operation) { + if (!virtual_devices.has_value()) return false; + + auto result = operation->walk([&](Operation* op) { + if (DirectUseOfVirtualDevice(*virtual_devices, op)) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + return result.wasInterrupted(); +} + +// Make invariant the `ShapeOp`s or a `ReadVariableOp` that's the `ShapeOp`'s +// predecessor. void MakeShapeOpInvariant(tf_device::ReplicateOp replicate_op, int num_replicas, Block* replicate_block, TF::ShapeOp shape_op) { + // Ignore ShapeOps that have virtual devices. + if (AncestorUsesVirtualDevice(replicate_op.getDevices(), shape_op)) return; + Value input = shape_op.getInput(); // If ShapeOp operand is replicate tensor block argument, replace with the // associated first replica operand. @@ -85,22 +124,6 @@ void MakeShapeOpInvariant(tf_device::ReplicateOp replicate_op, int num_replicas, } } -// Check if op uses a device from a list of virtual devices. 
-bool UsesVirtualDevice(const std::optional& virtual_devices, - Operation* operation) { - if (!virtual_devices.has_value()) return false; - - auto result = operation->walk([&](Operation* op) { - StringAttr op_device = op->getAttrOfType(kDeviceAttr); - if (!op_device) return WalkResult::advance(); - - if (virtual_devices.value().get(op_device.getValue())) - return WalkResult::interrupt(); - return WalkResult::advance(); - }); - return result.wasInterrupted(); -} - // Checks if op and inner op operands are all replicate invariant. bool IsOpReplicateInvariant(Region* replicate_region, Operation* op) { auto ancestor_of_replicate = [&](Region* region) { @@ -110,6 +133,9 @@ bool IsOpReplicateInvariant(Region* replicate_region, Operation* op) { for (Value operand : op->getOperands()) if (!ancestor_of_replicate(operand.getParentRegion())) return false; + // _TPUDeviceOrdinalPlaceholder implicitly depends on the replica. + if (llvm::isa(op)) return false; + bool has_replicate_operands = false; visitUsedValuesDefinedAbove(op->getRegions(), [&](OpOperand* operand) { if (!ancestor_of_replicate(operand->get().getParentRegion())) @@ -127,6 +153,10 @@ void HoistReplicateInvariantOps(tf_device::ReplicateOp replicate_op) { const int num_replicas = replicate_op.getN(); Block* replicate_block = &replicate_op.GetBody(); + // A `ShapeOp` that directly depends on a `tf_device.replicate` param and does + // not have a virtual device is assumed to return the same shape across all + // replicas. Thus it is invariant across replicas. + // TODO(b/277936694): Remove this assumption and special case. replicate_op.walk([&](TF::ShapeOp shape_op) { MakeShapeOpInvariant(replicate_op, num_replicas, replicate_block, shape_op); }); @@ -138,7 +168,7 @@ void HoistReplicateInvariantOps(tf_device::ReplicateOp replicate_op) { if (llvm::isa(inner_op)) continue; // Skip hoisting if the inner op device attribute is a virtual device // defined by tf_device.replicate. - if (UsesVirtualDevice(virtual_device_list, &inner_op)) continue; + if (DescendantUsesVirtualDevice(virtual_device_list, &inner_op)) continue; if (IsOpReplicateInvariant(replicate_region, &inner_op)) inner_op.moveBefore(replicate_op); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index aa4941ec5b6..a8bfb700209 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -464,7 +464,8 @@ void RegionResourceHoister::ReplaceOpWithNewOp() { // Clone this old operation but with new result types. Operation* new_op = Operation::create( op_->getLoc(), op_->getName(), new_result_types, op_->getOperands(), - op_->getAttrs(), op_->getSuccessors(), op_->getNumRegions()); + op_->getAttrs(), op_->getPropertiesStorage(), op_->getSuccessors(), + op_->getNumRegions()); builder.insert(new_op); // Move regions to the new op. 
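The replicate_invariant_op_hoisting.cc hunk above splits the old single walk into `AncestorUsesVirtualDevice` (climb enclosing ops) and `DescendantUsesVirtualDevice` (walk nested ops): a `tf.Shape` is only rewritten when neither it nor a wrapping launch sits on a virtual device, while hoisting still checks everything nested inside a candidate op. A standalone sketch on a toy op tree; the tree structure is invented for illustration, and the real ancestor walk also stops at the enclosing replicate op, which is omitted here.

```c++
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Op {
  std::string device;         // "" when the op has no device attribute
  Op* parent = nullptr;       // nullptr at the region root
  std::vector<Op*> children;  // nested ops
};

bool DirectUseOfVirtualDevice(const std::set<std::string>& virtual_devices,
                              const Op& op) {
  return !op.device.empty() && virtual_devices.count(op.device) > 0;
}

// Walks *up*: does this op or any enclosing op use a virtual device?
bool AncestorUsesVirtualDevice(const std::set<std::string>& virtual_devices,
                               const Op* op) {
  for (; op != nullptr; op = op->parent)
    if (DirectUseOfVirtualDevice(virtual_devices, *op)) return true;
  return false;
}

// Walks *down*: does this op or anything nested inside it use one?
bool DescendantUsesVirtualDevice(const std::set<std::string>& virtual_devices,
                                 const Op& op) {
  if (DirectUseOfVirtualDevice(virtual_devices, op)) return true;
  for (const Op* child : op.children)
    if (DescendantUsesVirtualDevice(virtual_devices, *child)) return true;
  return false;
}

int main() {
  std::set<std::string> virtual_devices{"TPU_REPLICATED_CORE_0"};
  Op launch{"TPU_REPLICATED_CORE_0"};
  Op shape{"", &launch};
  launch.children.push_back(&shape);

  std::cout << AncestorUsesVirtualDevice(virtual_devices, &shape) << "\n";   // 1
  std::cout << DescendantUsesVirtualDevice(virtual_devices, shape) << "\n";  // 0
}
```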
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc index 08e7b308b10..99693a91b2f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc @@ -97,7 +97,8 @@ void EliminateUnusedResults( OpBuilder builder(op); Operation *new_op = Operation::create( op->getLoc(), op->getName(), new_result_types, op->getOperands(), - op->getAttrs(), op->getSuccessors(), op->getNumRegions()); + op->getAttrs(), op->getPropertiesStorage(), op->getSuccessors(), + op->getNumRegions()); builder.insert(new_op); // Move region bodies to the new operation. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 3b176cb0ba3..1edc7f4bb73 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -19,9 +19,13 @@ limitations under the License. #include #include #include +#include #include #include #include +#include +#include +#include #include "absl/container/flat_hash_set.h" #include "llvm/ADT/ArrayRef.h" @@ -45,6 +49,7 @@ limitations under the License. #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/FunctionInterfaces.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project @@ -69,6 +74,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h" #include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/translate/hlo_to_mhlo/hlo_utils.h" @@ -427,6 +433,35 @@ Type GetType(Attribute shape_attr, Attribute type_attr) { return UnrankedTensorType::get(type.getValue()); } +// Returns a new arg type based on the shape and element type. If there are +// dynamic bounds attribute to the arg, update the bounds based on the shape +// as well. +Type GetNewArgType(Type old_arg_type, ArrayRef shape, + Type element_type, mlir::MLIRContext* context) { + Type new_arg_type = tensorflow::GetTypeFromTFTensorShape(shape, element_type); + + if (auto input_ty = old_arg_type.dyn_cast()) { + ArrayRef bounds = hlo::encodingToBounds(input_ty.getEncoding()); + // The input type has bounded dynamic dimension. + if (!bounds.empty()) { + SmallVector new_bounds(bounds.begin(), bounds.end()); + SmallVector new_shape(shape.begin(), shape.end()); + // If dimension of the input type is dynamic. Update the + // bounds of the dim with the new type if needed. 
+ for (int i = 0; i < input_ty.getShape().size(); i++) { + if (hlo::isDynamicDimSize(input_ty.getShape()[i])) { + new_bounds[i] = new_shape[i]; + new_shape[i] = ShapedType::kDynamic; + } + } + new_arg_type = tensorflow::GetTypeFromTFTensorShape( + new_shape, element_type, + mhlo::TypeExtensionsAttr::get(context, new_bounds)); + } + } + return new_arg_type; +} + } // namespace // Returns whether type can be further refined. @@ -883,6 +918,10 @@ class ShapeInference { // yields. bool InferShapeForCaseRegion(CaseRegionOp op); + // Infers the shape CaseRegion outputs based on the embedded StableHLO module. + // Returns true if a return type was changed. + bool InferShapeForXlaCallModule(XlaCallModuleOp op); + // Infers the shape of _XlaHostComputeMlir based on the host computation // module. Returns true if a return type was changed. bool InferShapeForXlaHostComputeMlir(_XlaHostComputeMlirOp op); @@ -955,6 +994,14 @@ class ShapeInference { // TODO(b/154065712): Remove propagate_caller_callee_constants once using // SCCP pass instead. bool propagate_caller_callee_constants_; + + // XlaCallModule loader, which is used to deserialize the StableHLO module in + // each `XlaCallModule` op. Uses its own MLIRContext since the loader needs to + // load additional dialects, which is not allowed for the main context since + // shape inference may be called from a pass. + MLIRContext xla_call_module_context_; + DenseMap> + xla_call_module_loaders_; }; ShapeInference::ShapeInference(int64_t graph_version, ModuleOp module, @@ -1141,6 +1188,74 @@ bool ShapeInference::InferShapeForCaseRegion(CaseRegionOp op) { return changed; } +bool ShapeInference::InferShapeForXlaCallModule(XlaCallModuleOp op) { + tensorflow::XlaCallModuleLoader* loader; + { + const auto [it, inserted] = xla_call_module_loaders_.insert({op, nullptr}); + + // Lazily parse XlaCallModule's embedded HLO module and cache the loader to + // avoid repeatedly parsing the module. + if (inserted) { + std::vector dim_args_spec; + for (auto attr : op.getDimArgsSpec().getAsRange()) { + dim_args_spec.push_back(attr.getValue().str()); + } + + // Always use the first platform. The assumption is that shape inference + // results should be the same regardless of which platform is chosen. + int platform_index = op.getPlatforms().size() > 1 ? 0 : -1; + + auto l = tensorflow::XlaCallModuleLoader::Create( + &xla_call_module_context_, op.getVersion(), op.getModule().str(), + std::move(dim_args_spec), platform_index); + if (!l.ok()) { + LLVM_DEBUG(llvm::dbgs() << "Parsing error in XlaCallModule: " + << l.status().ToString() << "\n"); + return false; + } + it->second = *std::move(l); + } + + loader = it->second.get(); + } + + // Cannot pass `op.getArgs().getTypes()` to `loader->RefineDynamicShapes` + // because `op` and `loader` are using different MLIR contexts. See comments + // on `xla_call_module_context_` for details. 
+ std::vector input_shapes; + input_shapes.reserve(op.getArgs().size()); + for (mlir::Type type : op.getArgs().getTypes()) { + input_shapes.push_back(xla::TypeToShape(type)); + } + + tsl::Status status = loader->RefineDynamicShapes(input_shapes); + if (!status.ok()) { + LLVM_DEBUG(llvm::dbgs() << "Failed during XlaCallModule shape refinement: " + << status.ToString()); + return false; + } + + bool changed = false; + for (auto [result, type] : + llvm::zip(op.getResults(), loader->output_types())) { + auto ranked = type.dyn_cast(); + if (ranked == nullptr) { + LLVM_DEBUG(llvm::dbgs() + << "Unsupported XlaCallModule result type: " << type); + continue; + } + + // Build a new type object from `type` and `elem_type`. `type` is owned by + // `xla_call_module_context_` and should not be mixed with op's context. + auto new_type = RankedTensorType::get( + ranked.getShape(), getElementTypeOrSelf(result.getType())); + + changed = RefineResultType(op, result, new_type) || changed; + } + + return changed; +} + bool ShapeInference::InferShapeForXlaHostComputeMlir( _XlaHostComputeMlirOp host_compute_op) { // Extract the module and function. @@ -1741,14 +1856,14 @@ bool ShapeInference::InferShapeForXlaGatherOp(XlaGatherOp op) { auto output_shape = xla::ShapeInference::InferGatherShape( input_shape, start_indices_shape, gather_dim_numbers, slice_sizes); if (!output_shape.ok()) { - op->emitError(output_shape.status().error_message()); + op->emitError() << output_shape.status().message(); return false; } auto refined_type = xla::ConvertShapeToType( *output_shape, mlir::Builder(op)); if (!refined_type.ok()) { - op->emitError(refined_type.status().error_message()); + op->emitError() << refined_type.status().message(); return false; } @@ -2030,7 +2145,8 @@ bool ShapeInference::RefineWithInferTypeOpInterface( SmallVector inferred; LogicalResult res = infer_ti.inferReturnTypes( op->getContext(), op->getLoc(), op->getOperands(), - op->getAttrDictionary(), op->getRegions(), inferred); + op->getAttrDictionary(), op->getPropertiesStorage(), op->getRegions(), + inferred); if (failed(res)) { op->emitOpError("failed to refine type as inference failed"); return false; @@ -2319,6 +2435,10 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op, while_region, while_region.getBody().front().getTerminator()->getOperandTypes()); + if (auto xla_call_module = dyn_cast(op)) { + return InferShapeForXlaCallModule(xla_call_module); + } + if (auto host_compute_op = dyn_cast<_XlaHostComputeMlirOp>(op)) { return InferShapeForXlaHostComputeMlir(host_compute_op); } @@ -2952,8 +3072,9 @@ FailureOr InferShapeForFunction(func::FuncOp func, element_type = unranked_input_ty.getElementType(); } - auto new_arg_type = - tensorflow::GetTypeFromTFTensorShape(shape, element_type); + auto new_arg_type = GetNewArgType(func_type.getInput(i), shape, + element_type, func.getContext()); + if (new_arg_type != func_type.getInput(i)) { // If the new type is more detailed, trigger shape inference. 
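`InferShapeForXlaCallModule` above caches one deserialized loader per `XlaCallModule` op with the insert-then-fill idiom, so each embedded module is parsed at most once even when shape inference revisits the op; the real code additionally builds the loader in a separate MLIRContext, a detail elided below. A std-only sketch of the caching pattern, where `Loader` is a placeholder for `XlaCallModuleLoader`:

```c++
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Placeholder for the expensive-to-build per-op state.
struct Loader {
  explicit Loader(std::string module) : module(std::move(module)) {
    std::cout << "parsing " << this->module << "\n";  // happens once per key
  }
  std::string module;
};

using OpHandle = int;  // stands in for the XlaCallModuleOp key

Loader* GetOrCreateLoader(std::map<OpHandle, std::unique_ptr<Loader>>& cache,
                          OpHandle op, const std::string& serialized_module) {
  // Insert a null slot first; only build the loader if the key was new.
  auto [it, inserted] = cache.insert({op, nullptr});
  if (inserted) it->second = std::make_unique<Loader>(serialized_module);
  return it->second.get();
}

int main() {
  std::map<OpHandle, std::unique_ptr<Loader>> cache;
  GetOrCreateLoader(cache, /*op=*/7, "stablehlo-module-A");
  GetOrCreateLoader(cache, /*op=*/7, "stablehlo-module-A");  // cache hit, no reparse
  std::cout << "cache size: " << cache.size() << "\n";       // 1
}
```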
func.getArgument(i).setType(new_arg_type); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td index f46fb14d2ca..e307266c93f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td @@ -170,7 +170,7 @@ def HostLaunchToOutsideCompiledPass : Pass<"tf-device-host-launch-to-outside-com "tf_device.launch"() { "tf.B"() tf_device.return - } {device = "TPU_REPLICATED_HOST"} : () -> () + } {device = "TPU_REPLICATED_HOST_0"} : () -> () "tf.C"() tf_device.return }) {num_cores_per_replica = 1, topology = "", device_assignment = []} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc index e84cf959800..46a23a48a6d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.h" +#include + #include "llvm/Support/CommandLine.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project @@ -26,9 +28,9 @@ limitations under the License. #include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/tsl/platform/statusor.h" @@ -72,7 +74,7 @@ void GraphOptPass::runOnOperation() { auto graph = std::make_unique(flib_def); Status status = ConvertMlirToGraph(module_in, confs, &graph, &flib_def); if (!status.ok()) { - mlir::emitError(mlir::UnknownLoc::get(&ctx)) << status.error_message(); + mlir::emitError(mlir::UnknownLoc::get(&ctx)) << status.message(); return signalPassFailure(); } @@ -92,7 +94,7 @@ void GraphOptPass::runOnOperation() { Status status = pass->Run(options); if (!status.ok()) { mlir::emitError(mlir::UnknownLoc::get(&ctx)) - << pass->name() << ": " << status.error_message(); + << pass->name() << ": " << status.message(); return signalPassFailure(); } } @@ -104,7 +106,7 @@ void GraphOptPass::runOnOperation() { ConvertGraphToMlir(**options.graph, debug_info, flib_def, specs, &ctx); if (!module_or_status.ok()) { mlir::emitError(mlir::UnknownLoc::get(&ctx)) - << module_or_status.status().error_message(); + << module_or_status.status().message(); return signalPassFailure(); } auto module_out = std::move(module_or_status).value(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td index 94d5ee37e05..839d9d601d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td @@ -393,6 +393,15 @@ def ReplicaIDToDeviceOrdinalPass : Pass<"tf-replica-id-to-device-ordinal", "mlir }]; } +def EmbeddingPipeliningPass : Pass<"tf-embedding-pipelining", "mlir::ModuleOp"> { + let summary = "Rewrite graph for embedding pipelining"; + let constructor = 
"TFDevice::CreateEmbeddingPipeliningPass()"; + let description = [{ + For architectures that support accelerated embedding lookups, this pass will + rewrite the graph to use pipelining for better device utilization. + }]; +} + def ConvertReadonlyReferenceVariablesToResourceVariablesPass : Pass<"tf-readonly-references-to-resources", "mlir::func::FuncOp"> { let summary = "Convert readonly reference variables to resource variables."; @@ -1115,6 +1124,13 @@ def ClusterOutliningPass : Pass<"tf-device-cluster-outlining", "ModuleOp"> { }]; let constructor = "TFDevice::CreateClusterOutliningPass()"; + + let options = [ + Option<"globally_unique_func_names_", "globally-unique-func-names", "bool", + /*default=*/"true", + "If true, the pass adds extra identifiers to make function names " + "globally unique within a process, not just within a module."> + ]; } def ConvertTfControlFlowToScfPass : Pass<"convert-tf-control-flow-to-scf", "ModuleOp"> { @@ -1168,6 +1184,13 @@ def LaunchOutliningPass : Pass<"tf-device-launch-outlining", "ModuleOp"> { }]; let constructor = "TFDevice::CreateLaunchOutliningPass()"; + + let options = [ + Option<"globally_unique_func_names_", "globally-unique-func-names", "bool", + /*default=*/"true", + "If true, the pass adds extra identifiers to make function names " + "globally unique within a process, not just within a module."> + ]; } def ConvertLaunchFuncToTFCallPass : Pass<"tf-device-convert-launch-func-to-tf-call", "ModuleOp"> { @@ -1575,7 +1598,7 @@ def TPURewritePass : Pass<"tf-tpu-rewrite", "ModuleOp"> { ```mlir func @tf_tpu_rewrite(%arg0: tensor, %arg1: tensor) { - %0:2 = tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]}, n = 2 : i32} { + %0:2 = tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST_0 = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]}, n = 2 : i32} { %1:2 = "tf_device.launch"() ( { %compilation_status, %program = "tf._TPUCompileMlir"() {mlir_module = ""} : () -> (tensor, tensor<3x!tf_type.string>) tf_device.return %compilation_status, %program : tensor, tensor<3x!tf_type.string> @@ -1781,6 +1804,31 @@ def TPUMergeVariablesWithExecutePass : Pass<"tf-tpu-merge-variables-with-execute let constructor = "TFTPU::CreateTPUMergeVariablesWithExecutePass()"; } +def ExtractTPUCopyWithDynamicShapeOpPass : Pass<"tf-extract-tpu-copy-with-dynamic-shape-op", "mlir::func::FuncOp"> { + let summary = "Extract the TPUCopyWithDynamicShapeOp out of the host launch and place it on device launch"; + + let description = [{ + This pass looks for TPUCopyWithDynamicShapeOp which wraps in a + `tf_device.launch` with host device attribute. It extracts the ops and wrap + them in `tf_device.launch` with tpu device attribute so that ops can be + run on TPU instead of CPU while still being compiled on host. 
+ }]; + + let constructor = "TFTPU::CreateExtractTPUCopyWithDynamicShapeOpPass()"; +} + +def TPUAnnotateDynamicShapeInputsPass : Pass<"tf-tpu-annotate-dynamic-shape-inputs", "ModuleOp"> { + let summary = "Annotate the inputs returned by TPUCopyWithDynamicShapeOp with dynamic shape"; + + let description = [{ + This pass looks for the usage of the result of TPUCopyWithDynamicShapeOp + and sets the shape of these inputs to be dynamic shaped. This will ensure + that the generated HLO program is correctly reflecting the dynamic shape. + }]; + + let constructor = "TFTPU::CreateTPUAnnotateDynamicShapeInputsPass()"; +} + def ReplicateInvariantOpHoistingPass : Pass<"tf-replicate-invariant-op-hoisting", "mlir::func::FuncOp"> { let summary = "Hoists replicate invariant operations out of replicate"; @@ -1790,6 +1838,12 @@ def ReplicateInvariantOpHoistingPass : Pass<"tf-replicate-invariant-op-hoisting" if possible. This currently updates or replaces `tf.Shape` ops of replicated arguments, either tensors or resources. + The primary benefit of the pass is to hoist `num_replicas` `_TPUCompile`s + into a single `_TPUCompile`. + + This pass assumes that when a `tf.Shape` directly inputs from `replicate` + params, then it is the same shape across replicas. + For example, the following ```mlir @@ -1878,7 +1932,7 @@ def OutsideCompiledToHostLaunchPass : Pass<"tf-outside-compiled-to-host-launch", "tf_device.launch"() { "tf.B"() {_xla_outside_compilation = "cluster1"} tf_device.return - } {device = "TPU_REPLICATED_HOST"} : () -> () + } {device = "TPU_REPLICATED_HOST_0"} : () -> () "tf.C"() tf_device.return }) {num_cores_per_replica = 1, topology = "", device_assignment = []} @@ -2357,6 +2411,31 @@ def TPUDevicePropagationPass : Pass<"tf-tpu-device-propagation", "mlir::func::Fu let constructor = "TFTPU::CreateTPUDevicePropagationPass()"; } +def TPUColocateSplitsPass : Pass<"tf-tpu-colocate-splits", "mlir::func::FuncOp"> { + let summary = "Colocates each Split op with its predecessor"; + let constructor = "TFTPU::CreateTPUColocateSplitsPass()"; + let description = [{ + It is beneficial for performance to assign a `Split` op to the same device + as its predecessor. This is because the weight of cut edges is always + minimized when the `Split` is with its predecessor. This colocation + constraint will be used by the placer graph optimization to assign a device + to the op. + + This pass should run in the export pipeline after tf-replicate-to-island so + each replica has its own distinct (predecessor, Split) pair. 
+ + The colocation class (`_class`) of the `Split` is set to the same class as + its predecessor: + + ```mlir + %outputs1:2, %control1 = tf_executor.island wraps "tf.IteratorGetNext"(%arg) + {_class = ["loc:@dataset_iterator_1"]} + %outputs2:2, %control2 = tf_executor.island wraps "tf.Split"(%outputs0, %outputs1#1) + {_class = ["loc:@dataset_iterator_1", num_split = 2 : i32} + ``` + }]; +} + def TPUIdentityPruningPass : Pass<"tf-tpu-identity-pruning", "ModuleOp"> { let summary = "Removes Identity/IdentityN ops from the TPU computation"; let constructor = "TFTPU::CreateTPUIdentityPruningPass()"; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.cc index a1552ffc4ac..141807309c4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.cc @@ -363,8 +363,8 @@ LogicalResult FreezeVariables(ModuleOp module, tensorflow::Session* session) { const tensorflow::DeviceMgr* mgr = nullptr; auto status = session->LocalDeviceManager(&mgr); if (!status.ok()) { - module->emitError("failed to fetch device manager: " + - status.error_message()); + module->emitError( + absl::StrCat("failed to fetch device manager: ", status.message())); return failure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_annotate_dynamic_shape_inputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_annotate_dynamic_shape_inputs.cc new file mode 100644 index 00000000000..76f64d18935 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_annotate_dynamic_shape_inputs.cc @@ -0,0 +1,159 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
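The TPUColocateSplitsPass description above boils down to copying the predecessor's colocation class (`_class`) onto each `Split` so the placer keeps the two on one device. A toy model of that attribute copy; the graph structure and attribute plumbing are simplified stand-ins, not the pass's data structures.

```c++
#include <iostream>
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<std::string> colocation_class;  // the "_class" attribute
  Node* predecessor = nullptr;                // defining op of the split input
};

// If the Split's predecessor carries a colocation class, copy that class onto
// the Split so the placer assigns both nodes to the same device.
void ColocateSplitWithPredecessor(Node& split) {
  if (split.predecessor == nullptr) return;
  if (split.predecessor->colocation_class.empty()) return;
  split.colocation_class = split.predecessor->colocation_class;
}

int main() {
  Node iterator{"IteratorGetNext", {"loc:@dataset_iterator_1"}};
  Node split{"Split", {}, &iterator};
  ColocateSplitWithPredecessor(split);
  std::cout << split.name << " -> " << split.colocation_class.front() << "\n";
}
```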
+==============================================================================*/ + +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +#define DEBUG_TYPE "tf-tpu-annotate-dynamic-shape-inputs" + +namespace mlir { +namespace TFTPU { + +namespace { + +#define GEN_PASS_DEF_TPUANNOTATEDYNAMICSHAPEINPUTSPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" + +class TPUAnnotateDynamicShapeInputsPass + : public impl::TPUAnnotateDynamicShapeInputsPassBase< + TPUAnnotateDynamicShapeInputsPass> { + void runOnOperation() override; +}; + +// Finds op that created a given value. If the value is a BlockArgument, this +// returns the owner of the Block. +Operation* GetOpOfValue(Value value) { + if (auto block_arg = value.dyn_cast()) + return block_arg.getOwner()->getParentOp(); + + return value.getDefiningOp(); +} + +void TPUAnnotateDynamicShapeInputsPass::runOnOperation() { + getOperation().walk([&](tf_device::ClusterFuncOp cluster_func_op) { + Builder builder(cluster_func_op->getContext()); + // Skip non-tpu device cluster_func. + auto cluster_id = + cluster_func_op->getAttrOfType(TF::kReplicationInfoAttr); + if (!cluster_id) return WalkResult::advance(); + + llvm::SmallVector dynamic_shape_arg_index; + + // Traverse the operands of the cluster func op and find which operand + // is returned by TPUAnnotateTensorsWithDynamicShapeOp. + for (const auto& cluster_func_operand : + llvm::enumerate(cluster_func_op.getOperands())) { + auto device_launch_op = llvm::dyn_cast( + GetOpOfValue(cluster_func_operand.value())); + if (!device_launch_op) continue; + for (auto result : llvm::zip( + device_launch_op.getResults(), + device_launch_op.GetBody().getTerminator()->getOperands())) { + if (std::get<0>(result) == cluster_func_operand.value() && + llvm::isa( + std::get<1>(result).getDefiningOp())) { + dynamic_shape_arg_index.push_back(cluster_func_operand.index()); + } + } + } + + cluster_func_op->setAttr(TF::kDynamicArgIndexAttr, + builder.getI32ArrayAttr(dynamic_shape_arg_index)); + + FlatSymbolRefAttr func_attr = cluster_func_op.getFuncAttr(); + func::FuncOp func = + cluster_func_op->getParentOfType().lookupSymbol( + func_attr.getValue()); + + // Update the marked argument with dynamic shapes. + for (int index : dynamic_shape_arg_index) { + BlockArgument arg = func.getArgument(index); + auto inputType = arg.getType().dyn_cast(); + // Only rank 1 tensor is supported for now. + if (!inputType || inputType.getRank() != 1) continue; + auto shape = llvm::to_vector<4>(inputType.getShape()); + llvm::SmallVector bounds(shape.begin(), shape.end()); + // Mark the dim as dynamic dim. 
+ shape[0] = ShapedType::kDynamic; + auto extensions = + mhlo::TypeExtensionsAttr::get(func->getContext(), bounds); + auto resultType = + RankedTensorType::get(shape, inputType.getElementType(), extensions); + arg.setType(resultType); + } + llvm::SmallVector arg_types; + for (auto arg : func.getArguments()) arg_types.push_back(arg.getType()); + func.setType( + FunctionType::get(func.getContext(), arg_types, + func.front().getTerminator()->getOperandTypes())); + return WalkResult::advance(); + }); + + // Remove the annotated op after since it is just a placeholder. + DenseSet launch_ops; + getOperation().walk([&](Operation* op) { + if (llvm::isa(op)) { + for (auto result : llvm::zip(op->getOperands(), op->getResults())) { + std::get<1>(result).replaceAllUsesWith(std::get<0>(result)); + } + launch_ops.insert(op->getParentOfType()); + op->erase(); + } + return WalkResult::advance(); + }); + + for (auto launch_op : launch_ops) { + Block& block = launch_op.GetBody(); + if (&block.front() == &block.back()) { + // The tf_device.launch is empty (except for the return). + // Remove the whole tf_device.launch, since later passes will make it send + // the arguments back and forth between the devices. + Operation* return_op = &block.back(); + assert(llvm::isa(return_op)); + for (auto [inner, outer] : + llvm::zip(return_op->getOperands(), launch_op->getResults())) { + outer.replaceAllUsesWith(inner); + } + launch_op->erase(); + } + } +} + +} // namespace + +std::unique_ptr> +CreateTPUAnnotateDynamicShapeInputsPass() { + return std::make_unique(); +} +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc index c9dd2824ce1..bb2f8f26b65 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include #include #include #include @@ -22,6 +23,7 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -142,6 +144,7 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters, std::string& device) { bool has_replicated_compiled_op = false; bool has_non_replicated_compiled_op = false; + bool has_local_device_name_collisions = false; // Use ordered set here to make error message below deterministic. std::set device_types; std::unordered_map devices; @@ -197,17 +200,20 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters, // information such as task, replica, job etc. An example fullname is // "/job:foo_bar/replica:1/task:2/device:GPU:3" if (devices.count(device_local_name)) { + std::string device1 = devices[device_local_name]; + std::string device2 = device_attr.str(); + // Is either of the two devices just a substring of the other? If + // not, we treat them as different devices, and we have a collision. + if (device1.find(device2) == std::string::npos && + device2.find(device1) == std::string::npos) { + has_local_device_name_collisions = true; + LOG(WARNING) << "found two devices with same local name " + << device_local_name + << " but conflicting fullname: " << device1 << " and " + << device2; + } + // Always keep the longer name. 
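In the tpu_annotate_dynamic_shape_inputs.cc file above, once the placeholder annotate ops are erased, a launch whose body holds nothing but its return is dropped entirely and its results are forwarded to the return operands. A simplified std-only model of that forwarding step (the structs here are invented stand-ins for the MLIR ops):

```c++
#include <iostream>
#include <string>
#include <vector>

struct Launch {
  std::vector<std::string> body_ops;         // ops other than the terminator
  std::vector<std::string> return_operands;  // values yielded by tf_device.return
  std::vector<std::string> results;          // values the launch produces
};

// If the launch body only contains the return, replace each launch result with
// the corresponding return operand and tell the caller to erase the launch.
bool CollapseEmptyLaunch(Launch& launch,
                         std::vector<std::string>& uses_of_results) {
  if (!launch.body_ops.empty()) return false;
  for (size_t i = 0; i < launch.results.size(); ++i)
    for (auto& use : uses_of_results)
      if (use == launch.results[i]) use = launch.return_operands[i];
  return true;
}

int main() {
  Launch launch{{}, {"%arg0"}, {"%launch_result"}};
  std::vector<std::string> uses{"%launch_result"};
  bool erased = CollapseEmptyLaunch(launch, uses);
  std::cout << erased << " " << uses.front() << "\n";  // 1 %arg0
}
```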
if (devices[device_local_name].size() < device_attr.str().size()) { - // If for same local name, the smaller device fullname is not - // a substring of larger device fullname, then there is definitely - // some issue with device names. - if (device_attr.str().find(devices[device_local_name]) == - std::string::npos) { - LOG(WARNING) << "found two devices with same local name but " - "conflicting fullname: " - << device_attr.str() << " and " - << devices[device_local_name]; - } devices[device_local_name] = device_attr.str(); } } else { @@ -233,9 +239,10 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters, for (const auto& device_names : devices) { LOG(WARNING) << device_names.first << ", " << device_names.second; } - } - if (devices.size() == 1 && - absl::StrContains(devices.begin()->second, "TPU:")) { + } else if (has_local_device_name_collisions) { + LOG(WARNING) << "Not assigning device because of conflicting fullnames."; + } else if (devices.size() == 1 && + absl::StrContains(devices.begin()->second, "TPU:")) { device = devices.begin()->second; } } @@ -437,12 +444,191 @@ tf_device::ClusterOp CreateClusterOp( return cluster; } +// Returns an op of the given type that uses the result, along with +// a list of identity ops along the way. +template +std::tuple> GetSingleUserOfType( + OpResult result) { + llvm::SmallVector identity_ops; + + do { + Operation* user = result.hasOneUse() ? *result.getUsers().begin() : nullptr; + if (auto t = llvm::dyn_cast_or_null(user)) { + return std::make_tuple(t, identity_ops); + } else if (auto identity = llvm::dyn_cast_or_null(user)) { + identity_ops.emplace_back(identity); + result = identity->getResult(0); + } else { + result = OpResult(); // reset to stop iterating + } + } while (result); + + return std::make_tuple(T(), identity_ops); +} + +using PartitionedClusterOutputMap = + absl::flat_hash_map>; + +// Returns the partitioned output ops from the cluster if there are any, +// along with any single user identity ops between them. Not all outputs +// of a cluster must be partitioned, so the output is a map from cluster +// output ids to ops. +std::tuple> +GetPartitionedOutputsAndIdentityOps(tf_device::ClusterOp cluster) { + PartitionedClusterOutputMap partitioned_outputs; + llvm::SmallVector erase_list; + + for (auto [cluster_result_id, cluster_result] : + llvm::enumerate(cluster.getResults())) { + auto [replicated_output, _] = + GetSingleUserOfType(cluster_result); + if (replicated_output) { + for (OpResult per_replica_result : replicated_output->getResults()) { + auto [partitioned_output, id_ops] = + GetSingleUserOfType( + per_replica_result); + if (partitioned_output) { + erase_list.insert(erase_list.end(), id_ops.begin(), id_ops.end()); + partitioned_outputs[cluster_result_id].emplace_back( + partitioned_output); + } + } + } + } + + return std::forward_as_tuple(partitioned_outputs, erase_list); +} + +// Inlines the partitioned output ops into the cluster, and updates +// their users to point to the replicate op instead. 
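The tpu_cluster_formation.cc hunk above refines the device-name check: two full device names that share a local name only count as a collision when neither is a substring of the other, the longer (more qualified) spelling is kept, and a genuine collision blocks device assignment. The substring rule in isolation, with plain std::string:

```c++
#include <iostream>
#include <string>

// Two full device names that share a local name are considered compatible if
// one is simply a more qualified spelling of the other; otherwise they collide.
bool IsNameCollision(const std::string& device1, const std::string& device2) {
  return device1.find(device2) == std::string::npos &&
         device2.find(device1) == std::string::npos;
}

// When the names are compatible, the longer (more qualified) one is kept.
std::string MergeDeviceNames(const std::string& device1,
                             const std::string& device2) {
  return device1.size() < device2.size() ? device2 : device1;
}

int main() {
  std::string a = "/task:0/device:TPU:0";
  std::string b = "/job:worker/replica:0/task:0/device:TPU:0";
  std::string c = "/job:other/replica:0/task:0/device:TPU:0";

  std::cout << IsNameCollision(a, b) << "\n";   // 0: b merely qualifies a
  std::cout << IsNameCollision(b, c) << "\n";   // 1: genuinely different devices
  std::cout << MergeDeviceNames(a, b) << "\n";  // keeps the longer name
}
```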
+Operation* BuildPartitionedOutputs( + OpBuilder& builder, tf_device::ClusterOp cluster, + tf_device::ReplicateOp replicate_op, + PartitionedClusterOutputMap& partitioned_outputs, + llvm::SmallVector& erase_list, + llvm::SmallVector& result_types, int num_replicas) { + Operation* result_op; + llvm::SmallVector results; + uint64_t num_results = cluster.getNumResults(); + for (uint64_t result_id = 0; result_id < num_results; ++result_id) { + auto search = partitioned_outputs.find(result_id); + if (search == partitioned_outputs.end()) { + // If the output is not partitioned, directly pass it through. + results.emplace_back(cluster.getResult(result_id)); + + continue; + } + + // Otherwise, "inline" the partitioned output ops by: + // - Building a new op within the cluster. + // - Replacing all the uses of the original ops with the cluster's outputs. + llvm::SmallVector& ops = search->second; + for (auto [replica_id, partitioned_output] : llvm::enumerate(ops)) { + for (auto [core_id, result] : + llvm::enumerate(partitioned_output->getResults())) { + // outputs from replicate op are interleaved: + // [(replica:0,core:0), (replica:1,core:0), ..., + // (replica:0,core:1), (replica:1,core:1), ...] + uint64_t output_id = + core_id * num_replicas + replica_id + results.size(); + result.replaceAllUsesWith(replicate_op.getResult(output_id)); + } + } + + // Assume all the replicas have the same structure. + TF::TPUPartitionedOutputV2Op first_op = *(ops.begin()); + ArrayAttr dims = first_op.getPartitionDimsAttr(); + StringAttr sharding = first_op.get_XlaShardingAttr(); + Operation::result_type_range output_types = first_op.getResultTypes(); + result_op = builder.create( + replicate_op.getLoc(), output_types, cluster.getResult(result_id), dims, + sharding); + + results.insert(results.end(), result_op->getResults().begin(), + result_op->getResults().end()); + } + + // Once we've accumulated all the cluster's results, build a return op. + builder.create(result_op->getLoc(), results); + + // Then erase all the identity and partitioned output ops. + for (auto [_, ops] : partitioned_outputs) { + for (TF::TPUPartitionedOutputV2Op op : ops) { + op->erase(); + } + } + + for (TF::IdentityOp to_erase : erase_list) { + to_erase->erase(); + } + + return result_op; +} + +// Return the cluster's per-replica result type, converting any full-shaped +// tensor types into sharded-shaped ones if they're partitioned. +llvm::SmallVector GetClusterResultTypes( + tf_device::ClusterOp cluster, + const PartitionedClusterOutputMap& partitioned_outputs) { + llvm::SmallVector result_types; + Operation::result_type_range cluster_result_types = cluster.getResultTypes(); + if (partitioned_outputs.empty()) { + // Directly pass through the cluster's outputs if none are partitioned. + result_types.insert(result_types.end(), cluster_result_types.begin(), + cluster_result_types.end()); + } else { + // For each output of the cluster... + for (auto [output_id, result_type] : + llvm::enumerate(cluster_result_types)) { + auto search = partitioned_outputs.find(output_id); + if (search == std::end(partitioned_outputs)) { + // If it's not partitioned, directly pass it through. + result_types.emplace_back(result_type); + } else { + // Otherwise, pass through the result shard types. 
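The interleaved layout that BuildPartitionedOutputs relies on is easiest to see with concrete numbers. A small sketch, assuming 2 replicas, 2 cores per replica, and a made-up base offset of 3 for the first shard of one partitioned cluster output within the replicate op's result list:

#include <cstdint>
#include <cstdio>

// Illustrative sketch, not part of the patch: shards of one partitioned
// cluster output are laid out replica-major within each core, so the result
// for (replica, core) lives at base + core * num_replicas + replica.
int main() {
  const uint64_t num_replicas = 2, num_cores_per_replica = 2, base = 3;
  for (uint64_t core = 0; core < num_cores_per_replica; ++core)
    for (uint64_t replica = 0; replica < num_replicas; ++replica)
      std::printf("replica=%llu core=%llu -> replicate result #%llu\n",
                  static_cast<unsigned long long>(replica),
                  static_cast<unsigned long long>(core),
                  static_cast<unsigned long long>(
                      base + core * num_replicas + replica));
  // Prints results 3, 4, 5, 6 for (replica, core) = (0,0), (1,0), (0,1), (1,1).
  return 0;
}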
+ Operation::result_type_range partitioned_result_types = + (*search->second.begin())->getResultTypes(); + result_types.insert(result_types.end(), + partitioned_result_types.begin(), + partitioned_result_types.end()); + } + } + } + return result_types; +} + // Creates a `tf_device.replicate` to represent replication for the cluster, if -// necessary. +// necessary. Erases Identity ops between partitioned and replicated output ops. LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas, int num_cores_per_replica) { + OpBuilder builder(cluster); + auto [partitioned_outputs, erase_list] = + GetPartitionedOutputsAndIdentityOps(cluster); + + for (auto [_, ops] : partitioned_outputs) { + if (!(ops.empty() || ops.size() == num_replicas)) { + return (ops.begin())->emitOpError() + << "expected zero or " << num_replicas + << " 'TPUPartitionedOutput' op(s), instead got " + << partitioned_outputs.size(); + } + } + // No need to replicate. - if (num_replicas == 1) return success(); + if (num_replicas == 1) { + // Collapse all the Identity ops between the TRO and TPO ops. + if (!partitioned_outputs.empty()) { + for (TF::IdentityOp to_erase : erase_list) { + Value in = to_erase->getOperand(0); + OpResult out = to_erase->getResult(0); + out.replaceAllUsesWith(in); + to_erase->erase(); + } + } + + return success(); + } if (num_replicas < 1) return cluster.emitError() << "requires '" << kNumReplicasAttr @@ -494,7 +680,7 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas, llvm::SmallVector packed_inputs; llvm::SmallVector replicated_ops; llvm::SmallVector packed_ops; - for (auto& pos_and_input : llvm::enumerate(replicated_input_ops)) { + for (const auto& pos_and_input : llvm::enumerate(replicated_input_ops)) { auto input = pos_and_input.value(); bool is_packed = input.getIsPacked(); const int num_operands = input->getNumOperands(); @@ -528,24 +714,28 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas, } // Create replicate op. - OpBuilder builder(cluster); + auto result_types = GetClusterResultTypes(cluster, partitioned_outputs); auto replicate_op = builder.create( cluster.getLoc(), num_replicas, llvm::SmallDenseMap>(), - replicated_inputs, packed_inputs, cluster.getResultTypes()); + replicated_inputs, packed_inputs, result_types); if (!mirrored_variable_indices.empty()) replicate_op->setAttr(kMirroredVariableIndicesAttr, builder.getI64ArrayAttr(mirrored_variable_indices)); // Replace replicated cluster results with replicate op results. - for (auto result_and_idx : llvm::enumerate(cluster.getResults())) { - Value result = result_and_idx.value(); - int idx = result_and_idx.index(); - auto replicate_outputs = llvm::make_range( - std::next(replicate_op.result_begin(), idx * num_replicas), - std::next(replicate_op.result_begin(), (idx + 1) * num_replicas)); + uint64_t offset = 0; + for (auto [idx, result] : llvm::enumerate(cluster.getResults())) { + if (partitioned_outputs.contains(idx)) { + // Partitioned output propagation happens in BuildPartitionedOutputs. 
+ offset += num_replicas * num_cores_per_replica; + continue; + } + auto replicate_outputs = llvm::make_range( + std::next(replicate_op.result_begin(), offset), + std::next(replicate_op.result_begin(), offset + num_replicas)); for (auto& use : llvm::make_early_inc_range(result.getUses())) { Operation* def = use.getOwner(); if (!llvm::isa(def)) { @@ -562,6 +752,8 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas, def->replaceAllUsesWith(replicate_outputs); } + + offset += num_replicas; } // Collect all `tf.TPUPartitionedInputV2` ops to be moved inside the @@ -587,11 +779,20 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas, // Create terminator for replicate op and move `tf_device.cluster` and // `tf.TPUPartitionedInputV2`(s) into replicate body. builder.setInsertionPointToEnd(&replicate_op.GetBody()); - auto return_op = builder.create(replicate_op.getLoc(), - cluster.getResults()); - for (auto pi : partitioned_inputs) pi->moveBefore(return_op); - cluster.getOperation()->moveBefore(return_op); + Operation* result_op; + if (!partitioned_outputs.empty()) { + result_op = BuildPartitionedOutputs(builder, cluster, replicate_op, + partitioned_outputs, erase_list, + result_types, num_replicas); + } else { + result_op = builder.create(replicate_op.getLoc(), + cluster.getResults()); + } + + for (auto pi : partitioned_inputs) pi->moveBefore(result_op); + + cluster.getOperation()->moveBefore(result_op); return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_colocate_splits.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_colocate_splits.cc new file mode 100644 index 00000000000..1e7f9c5a4d2 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_colocate_splits.cc @@ -0,0 +1,85 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFTPU { + +namespace { + +#define GEN_PASS_DEF_TPUCOLOCATESPLITSPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" + +constexpr char kDeviceAttr[] = "device"; +// Attribute of colocation classes. +constexpr char kClassAttr[] = "_class"; + +bool HasDevice(Operation* op) { + auto attr = op->getAttrOfType(kDeviceAttr); + if (!attr) return false; + return !attr.getValue().empty(); +} + +// Returns the predecessors of `op` when `op`'s predecessors are wrapped by +// islands. 
+llvm::SmallVector IslandPredecessors(Operation* op) { + llvm::SmallVector predecessors; + for (Value operand : op->getOperands()) { + if (Operation* pred = operand.getDefiningOp()) { + int result_number = llvm::cast(operand).getResultNumber(); + if (auto pred_island = llvm::dyn_cast(pred)) { + Value yield_operand = pred_island.GetYield().getOperand(result_number); + predecessors.push_back(yield_operand.getDefiningOp()); + } + } + } + return predecessors; +} + +struct TPUColocateSplits + : public impl::TPUColocateSplitsPassBase { + void runOnOperation() override; +}; + +void TPUColocateSplits::runOnOperation() { + getOperation().walk([&](Operation* op) { + if (auto split = llvm::dyn_cast(op)) { + if (HasDevice(split) || split->getAttrOfType(kClassAttr)) + return WalkResult::advance(); + for (Operation* pred : IslandPredecessors(split)) { + if (auto colocation_classes = + pred->getAttrOfType(kClassAttr)) { + split->setAttr(kClassAttr, colocation_classes); + return WalkResult::advance(); + } + } + } + return WalkResult::advance(); + }); +} + +} // namespace + +std::unique_ptr> CreateTPUColocateSplitsPass() { + return std::make_unique(); +} + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc index c8ad200e328..04b488a3804 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc @@ -172,7 +172,7 @@ bool HandleReplicatedInputs( MutableArrayRef inputs = replicate.GetOperandsForBlockArgument(replicate_arg); - for (auto entry : llvm::enumerate(inputs)) { + for (const auto& entry : llvm::enumerate(inputs)) { auto input_op = entry.value().get().getDefiningOp(); if (!input_op || !IsSupportedInputOp(input_op, resource_alias_analysis)) return false; @@ -181,7 +181,7 @@ bool HandleReplicatedInputs( auto get_layout = BuildGetLayout(execute_arg_index, compilation_key, compile_launch, &builder); builder.setInsertionPoint(replicate); - for (auto entry : llvm::enumerate(inputs)) { + for (const auto& entry : llvm::enumerate(inputs)) { auto copy_with_layout = BuildCopyWithLayout(execute_launch, compile_launch, get_layout, entry.value().get(), &builder); @@ -222,7 +222,7 @@ void HandleCompileAndExecutes( llvm::cast(execute_launch.GetBody().front()); const auto& input_mapping = std::get<1>(execute_and_input_mapping); - for (auto& input_and_idx : llvm::enumerate(execute.getArgs())) { + for (const auto& input_and_idx : llvm::enumerate(execute.getArgs())) { Value input = input_and_idx.value(); const int64_t execute_arg_index = input_and_idx.index(); if (auto block_arg = input.dyn_cast()) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index b8eb4c22598..f20db8a9976 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include #include #include #include @@ -21,6 +22,7 @@ limitations under the License. 
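For reference, the TPUColocateSplits pass above amounts to copying a colocation class from the first island predecessor that carries one onto a Split op that has neither a device nor a class of its own. A simplified sketch with a hypothetical FakeOp type standing in for MLIR operations:

#include <map>
#include <string>
#include <vector>

// Illustrative sketch, not part of the patch: propagate "_class" from the
// first predecessor that has it, but only when the split itself has neither
// a "device" nor a "_class" attribute.
struct FakeOp {
  std::map<std::string, std::string> attrs;
  std::vector<FakeOp*> predecessors;
};

void ColocateSplit(FakeOp& split) {
  if (split.attrs.count("device") || split.attrs.count("_class")) return;
  for (FakeOp* pred : split.predecessors) {
    auto it = pred->attrs.find("_class");
    if (it != pred->attrs.end()) {
      split.attrs["_class"] = it->second;  // e.g. "loc:@some_variable"
      return;
    }
  }
}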
#include "absl/strings/match.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" @@ -29,6 +31,7 @@ limitations under the License. #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project @@ -82,7 +85,12 @@ constexpr char kBadArrayAttrLengthMsg[] = namespace { struct TPURewritePass : public impl::TPURewritePassBase { + explicit TPURewritePass(llvm::StringRef _module_name) + : module_name(_module_name) {} + void runOnOperation() override; + + llvm::StringRef module_name; }; // Creates a missing attribute error message. @@ -90,7 +98,8 @@ std::string CreateMissingAttributeMsg(llvm::StringRef attribute) { return llvm::formatv("requires attribute '{0}'", attribute).str(); } -LogicalResult EncapsulateFuncAndSerialize(func::FuncOp entry_func, +LogicalResult EncapsulateFuncAndSerialize(const std::string& module_name, + func::FuncOp entry_func, std::string* serialized_func_module) { ModuleOp module = entry_func->getParentOfType(); SymbolTable entry_module_table(module); @@ -98,7 +107,8 @@ LogicalResult EncapsulateFuncAndSerialize(func::FuncOp entry_func, // Create a new module to hold func and all referenced functions. OwningOpRef module_for_func = - ModuleOp::create(mlir::UnknownLoc::get(entry_func.getContext())); + ModuleOp::create(mlir::UnknownLoc::get(entry_func.getContext()), + absl::StrCat("module_", module_name)); auto parent_module = entry_func->getParentOfType(); auto versions_attr = parent_module->getAttr(kVersionsAttr); if (!versions_attr) @@ -207,6 +217,15 @@ LogicalResult SetMetadataProtoArgs( // Set args metadata in proto. mlir::StringAttr replication_attr_name = mlir::StringAttr::get( op.getContext(), "mhlo.is_same_data_across_replicas"); + + auto dynamic_arg_idx = op->getAttrOfType(TF::kDynamicArgIndexAttr); + llvm::SmallSet dynamic_arg_idx_set; + if (dynamic_arg_idx) { + for (auto idx : dynamic_arg_idx.getValue()) { + dynamic_arg_idx_set.insert(idx.dyn_cast().getInt()); + } + } + for (auto operand_type_and_idx : llvm::enumerate(op.getOperandTypes())) { Type operand_type = operand_type_and_idx.value(); int index = operand_type_and_idx.index(); @@ -217,7 +236,7 @@ LogicalResult SetMetadataProtoArgs( if (!status.ok()) return op.emitOpError( llvm::formatv("failed to determine operand type at index {0}: {1}", - index, status.error_message())); + index, status.message())); arg->set_dtype(dtype); // TODO(lyandy): Support other arg kinds. @@ -247,6 +266,10 @@ LogicalResult SetMetadataProtoArgs( mlir::UnitAttr attr = op.getFuncOp().getArgAttrOfType( index, replication_attr_name); arg->set_is_same_data_across_replicas(attr != nullptr); + + // Currently only support first dimension to be bounded dynamic. + arg->mutable_is_bounded_dynamic_dim()->Add( + dynamic_arg_idx_set.contains(index)); } return success(); @@ -336,12 +359,14 @@ tf_device::LaunchOp WrapOpInLaunch(OpBuilder* builder, Location loc, // Create a `tf._TPUCompileMlir` that contains a MLIR module that is // functionally equivalent to the function referenced by cluster_func. 
Operation* BuildCompileOp( - tf_device::ClusterFuncOp cluster_func, int num_replicas, - int num_cores_per_replica, llvm::StringRef compilation_device, + llvm::StringRef module_name, tf_device::ClusterFuncOp cluster_func, + int num_replicas, int num_cores_per_replica, + llvm::StringRef compilation_device, std::optional&& xla_device_assignment, OpBuilder* builder, bool tpu_compile_metadata_debug) { // Set metadata from attributes. tensorflow::tpu::TPUCompileMetadataProto metadata; + if (!module_name.empty()) metadata.set_module_name(module_name.str()); if (failed(SetMetadataProtoFromClusterFuncOp( cluster_func, num_replicas, num_cores_per_replica, std::move(xla_device_assignment), &metadata))) @@ -373,7 +398,10 @@ Operation* BuildCompileOp( func_attr.getValue()); std::string txt_module; - if (failed(EncapsulateFuncAndSerialize(func, &txt_module))) return nullptr; + if (failed(EncapsulateFuncAndSerialize( + module_name.empty() ? "unknown_graph" : module_name.str(), func, + &txt_module))) + return nullptr; auto compilation_status_type = RankedTensorType::get({}, builder->getType()); @@ -419,24 +447,24 @@ void AssignDevicesToReplicate( for (int core = 0; core < num_cores_per_replica; ++core) { llvm::SmallVector devices_by_core; devices_by_core.reserve(num_replicas); - for (int replica = 0; replica < num_replicas; ++replica) + llvm::SmallVector hosts_by_core; + hosts_by_core.reserve(num_replicas); + for (int replica = 0; replica < num_replicas; ++replica) { devices_by_core.push_back(tpu_devices[replica][core].device); + hosts_by_core.push_back(tpu_devices[replica][core].host); + } device_attrs.push_back( builder->getNamedAttr(tensorflow::GetDeviceAliasForLogicalCore(core), builder->getStrArrayAttr(devices_by_core))); + + // For data parallelism, also add replicated host devices, as these are + // necessary for outside compilation. + device_attrs.push_back(builder->getNamedAttr( + tensorflow::GetDeviceAliasForHostOfLogicalCore(core), + builder->getStrArrayAttr(hosts_by_core))); } - // For data parallelism, also add replicated host devices, as these are - // necessary for outside compilation. - llvm::SmallVector hosts; - hosts.reserve(num_replicas); - for (int replica = 0; replica < num_replicas; ++replica) - hosts.push_back(tpu_devices[replica][0].host); - - device_attrs.push_back(builder->getNamedAttr( - tensorflow::kTPUReplicatedHost, builder->getStrArrayAttr(hosts))); - replicate->setAttr(kDevicesAttr, builder->getDictionaryAttr(device_attrs)); } @@ -715,7 +743,7 @@ int GetNumResultsPreCluster(tf_device::ParallelExecuteOp parallel_execute) { } LogicalResult Rewrite( - tf_device::ClusterFuncOp cluster_func, + llvm::StringRef module_name, tf_device::ClusterFuncOp cluster_func, llvm::ArrayRef devices, ArrayRef compilation_result, OpBuilder* builder, bool tpu_compile_metadata_debug) { @@ -782,7 +810,7 @@ LogicalResult Rewrite( if (!status_or_device_coodinates.ok()) return cluster_func.emitError() << "error in fetching tpu device coordinates: " - << status_or_device_coodinates.status().error_message(); + << status_or_device_coodinates.status().message(); // Determine compilation and execution devices. auto status_or_tpu_device_assignment = @@ -792,7 +820,7 @@ LogicalResult Rewrite( if (!status_or_tpu_device_assignment.ok()) return cluster_func.emitError() << "error in fetching TPU compilation/execution devices: " - << status_or_tpu_device_assignment.status().error_message(); + << status_or_tpu_device_assignment.status().message(); // Create compile op. 
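The AssignDevicesToReplicate change above now emits a host alias per logical core, mirroring the per-core device alias, with one entry per replica in each list. A simplified sketch using plain strings; the literal alias prefixes are assumptions, since the real names come from GetDeviceAliasForLogicalCore and GetDeviceAliasForHostOfLogicalCore:

#include <map>
#include <string>
#include <utility>
#include <vector>

// Illustrative sketch, not part of the patch: build, per logical core, both a
// device list and a host list, each indexed by replica.
std::map<std::string, std::vector<std::string>> BuildReplicateDeviceMap(
    const std::vector<std::vector<std::pair<std::string, std::string>>>&
        tpu_devices /* [replica][core] -> {device, host} */) {
  std::map<std::string, std::vector<std::string>> device_map;
  if (tpu_devices.empty()) return device_map;
  for (size_t core = 0; core < tpu_devices[0].size(); ++core) {
    for (size_t replica = 0; replica < tpu_devices.size(); ++replica) {
      device_map["TPU_REPLICATED_CORE_" + std::to_string(core)].push_back(
          tpu_devices[replica][core].first);
      device_map["TPU_REPLICATED_HOST_" + std::to_string(core)].push_back(
          tpu_devices[replica][core].second);
    }
  }
  return device_map;
}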
auto& tpu_device_assignment = status_or_tpu_device_assignment.value(); @@ -800,11 +828,11 @@ LogicalResult Rewrite( // Create the TPUCompileMlir and TPUCompileSucceededAssert outside of // the parallel_execute. builder->setInsertionPoint(old_parallel_execute); - Operation* compile_op = - BuildCompileOp(cluster_func, num_replicas, num_cores_per_replica, - tpu_device_assignment.compilation_device, - std::move(tpu_device_assignment.xla_device_assignment), - builder, tpu_compile_metadata_debug); + Operation* compile_op = BuildCompileOp( + module_name, cluster_func, num_replicas, num_cores_per_replica, + tpu_device_assignment.compilation_device, + std::move(tpu_device_assignment.xla_device_assignment), builder, + tpu_compile_metadata_debug); if (!compile_op) return failure(); // This replaces _TPUCompileMlir placeholder ops that are required @@ -940,7 +968,7 @@ void TPURewritePass::runOnOperation() { auto cluster_id = op->getAttrOfType(TF::kReplicationInfoAttr); if (!cluster_id) return WalkResult::advance(); - if (failed(Rewrite(op, devices.device_names(), + if (failed(Rewrite(module_name, op, devices.device_names(), compilation_results[cluster_id], &builder, tpu_compile_metadata_debug_))) return WalkResult::interrupt(); @@ -970,8 +998,9 @@ void TPURewritePass::runOnOperation() { } // namespace -std::unique_ptr> CreateTPURewritePass() { - return std::make_unique(); +std::unique_ptr> CreateTPURewritePass( + llvm::StringRef module_name) { + return std::make_unique(module_name); } } // namespace TFTPU diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc index e3f9d15e5c6..bb8dd429174 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc @@ -16,6 +16,8 @@ limitations under the License. #include #include #include +#include +#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -35,6 +37,7 @@ limitations under the License. #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -46,6 +49,9 @@ namespace mlir { namespace TFTPU { namespace { +using OpShardingVariant = std::variant; +using OpShardingVector = llvm::SmallVector; + constexpr char kReplicateSharding[] = ""; constexpr char kShardingAttr[] = "mhlo.sharding"; constexpr char kUseSpmdAttr[] = "use_spmd_for_xla_partitioning"; @@ -65,20 +71,79 @@ std::string CreateMissingAttributeMsg(llvm::StringRef attribute) { return llvm::formatv("requires attribute '{0}'", attribute).str(); } -// Returns XLA sharding from TPUPartitionedInput op connected to a -// `tf_device.cluster_func` operand value. If value is a resource type then +// Returns nullptr if the op does not have a sharding attribute. +template +mlir::Operation* NullUnlessSharded(PartitionedOp op) { + return op.get_XlaSharding() ? op : nullptr; +} + +// Returns a TPUPartitionedInput op connected to a `tf_device.cluster_func` +// operand value if it has an XLA sharding. 
If value is a resource type then // TPUPartitionedInput op will be connected to a ReadVariable op that feeds into // a `tf_device.cluster_func`. -std::optional GetXlaShardingFromOperand(Value value) { +mlir::Operation* GetXlaShardingFromOperand(Value value) { Value value_to_visit = value; if (auto read_var = value_to_visit.getDefiningOp()) value_to_visit = read_var.getResource(); if (auto partitioned_input = - value_to_visit.getDefiningOp()) - return partitioned_input.get_XlaSharding(); + value_to_visit.getDefiningOp()) { + return NullUnlessSharded(partitioned_input); + } - return std::nullopt; + return nullptr; +} + +// Returns the op sharding attribute from a partitioned operator. +std::optional GetXlaShardingFromOperator(mlir::Operation* op) { + if (auto partitioned_output = + llvm::dyn_cast(op)) { + return partitioned_output.get_XlaSharding(); + } else if (auto partitioned_input = + llvm::dyn_cast(op)) { + return partitioned_input.get_XlaSharding(); + } else { + return std::nullopt; + } +} + +// Returns the sharding string from a op-sharding variant if it is available. +std::optional GetShardingStringFromVariant( + const OpShardingVariant& sharding_or_op) { + return std::visit( + [](auto&& sharding_or_op) -> std::optional { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return sharding_or_op; + } else { + return GetXlaShardingFromOperator(sharding_or_op); + } + }, + sharding_or_op); +} + +// Returns the sharding from a op-sharding variant if it is available and valid. +std::optional GetShardingFromVariant( + const OpShardingVariant& sharding_or_op) { + xla::OpSharding sharding; + const auto sharding_string = GetShardingStringFromVariant(sharding_or_op); + if (sharding_string && sharding.ParseFromString(sharding_string->str())) { + return sharding; + } else { + return std::nullopt; + } +} + +// Converts an op-sharding vector into a string attr using the builder. +mlir::ArrayAttr GetStrArrayAttr(Builder* builder, + const OpShardingVector& vect) { + llvm::SmallVector strings; + for (const auto& sharding_or_op : vect) { + if (const auto sharding = GetShardingStringFromVariant(sharding_or_op)) { + strings.emplace_back(builder->getStringAttr(*sharding)); + } + } + return builder->getArrayAttr(strings); } // Given a `tf_device.cluster_func` operand value return true iff it a device @@ -96,19 +161,37 @@ bool IsMaximalVariable(Value value) { // on CPU) // If the sharding is incorrect, return failure. If it's good, or if we can't // verify it, return success. -LogicalResult VerifySharding(Type type, StringRef sharding_string) { - xla::OpSharding sharding; - if (!sharding.ParseFromString(sharding_string.str())) { +LogicalResult VerifySharding(mlir::Type type, + const OpShardingVariant& sharding_or_op) { + auto* partitioned_op = + std::holds_alternative(sharding_or_op) + ? std::get(sharding_or_op) + : nullptr; + const auto sharding = GetShardingFromVariant(sharding_or_op); + if (!sharding || sharding->type() != xla::OpSharding::OTHER) { // Some test cases use \01\02\03 as sharding, to test propagation. Treat - // a non-proto sharding as valid, and don't verify further. - return success(); - } - if (sharding.type() != xla::OpSharding::OTHER) { - // We currently only verify shardings that actually break a tensor apart. + // a non-proto sharding as valid, and don't verify further. We also only + // verify shardings that actually break a tensor apart. 
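The sharding identification rework above keeps either a sharding string already in hand or the partitioned op it still has to be read from, and dispatches on that with std::visit. A self-contained sketch of the pattern, with a hypothetical FakeShardedOp standing in for the TPUPartitionedInputV2/OutputV2 ops:

#include <optional>
#include <string>
#include <type_traits>
#include <variant>

// Illustrative sketch, not part of the patch: resolve a sharding string from
// either alternative of the variant.
struct FakeShardedOp {
  std::optional<std::string> sharding;
};

using ShardingOrOp = std::variant<std::string, FakeShardedOp*>;

std::optional<std::string> GetSharding(const ShardingOrOp& sharding_or_op) {
  return std::visit(
      [](auto&& value) -> std::optional<std::string> {
        using T = std::decay_t<decltype(value)>;
        if constexpr (std::is_same_v<T, std::string>) {
          return value;
        } else {
          return value->sharding;  // std::nullopt if the op is unsharded
        }
      },
      sharding_or_op);
}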
return success(); } if (RankedTensorType ranked_type = type.dyn_cast()) { - if (ranked_type.getRank() < sharding.tile_assignment_dimensions_size()) { + const int64_t tensor_rank = ranked_type.getRank(); + int tile_assignment_rank = sharding->tile_assignment_dimensions_size(); + + // When a tensor is partial or subgroup tiled, its tile assignment will + // have one or more dimension(s) than its rank; so, we subtract them to + // determine which rank the sharding is compatible with. + tile_assignment_rank -= (int)sharding->replicate_on_last_tile_dim(); + tile_assignment_rank -= sharding->last_tile_dims_size(); + + if (tensor_rank < tile_assignment_rank) { + if (partitioned_op) { + partitioned_op->emitError() + << "tensor of type " << ranked_type << " (rank=" << tensor_rank + << ") sharded in " << (tile_assignment_rank - tensor_rank) + << " extra dimension(s) by: " << sharding->DebugString(); + } + return failure(); } } @@ -116,21 +199,20 @@ LogicalResult VerifySharding(Type type, StringRef sharding_string) { } // Verify sharding for all arguments and return values. -LogicalResult VerifyShardings( - mlir::func::FuncOp func, - const llvm::SmallVectorImpl& sharding_for_args, - const llvm::SmallVectorImpl& sharding_for_rets) { +LogicalResult VerifyShardings(mlir::func::FuncOp func, + const OpShardingVector& sharding_for_args, + const OpShardingVector& sharding_for_rets) { Block& function_block = func.front(); for (auto sharding_and_arg : llvm::zip(sharding_for_args, function_block.getArguments())) { - StringRef sharding = std::get<0>(sharding_and_arg); + const auto& sharding = std::get<0>(sharding_and_arg); BlockArgument arg = std::get<1>(sharding_and_arg); if (failed(VerifySharding(arg.getType(), sharding))) return failure(); } Operation* terminator = function_block.getTerminator(); for (auto sharding_and_retval : llvm::zip(sharding_for_rets, terminator->getOpOperands())) { - StringRef sharding = std::get<0>(sharding_and_retval); + const auto& sharding = std::get<0>(sharding_and_retval); OpOperand& retval = std::get<1>(sharding_and_retval); if (failed(VerifySharding(retval.get().getType(), sharding))) return failure(); @@ -215,8 +297,7 @@ std::optional GetXlaShardingFromArg( void IdentifyXlaShardingForComputationInputs( const llvm::SmallVector& logical_device_vec, bool use_spmd, bool infer_from_computation, tf_device::ClusterFuncOp cluster_func, - func::FuncOp func, Builder* builder, - llvm::SmallVectorImpl& sharding_for_args) { + func::FuncOp func, Builder* builder, OpShardingVector& sharding_for_args) { // Look up function definition from module. Block& function_block = func.front(); @@ -245,7 +326,7 @@ void IdentifyXlaShardingForComputationInputs( BlockArgument arg = std::get<1>(operand_and_arg); if (auto operand_sharding = GetXlaShardingFromOperand(operand)) { - sharding_for_args.push_back(operand_sharding.value()); + sharding_for_args.push_back(operand_sharding); continue; } @@ -271,24 +352,24 @@ void IdentifyXlaShardingForComputationInputs( } } -// Returns XLA sharding from TPUPartitionedOutput or TPUPartitionedInput (via -// AssignVariableOp/resource write) op connected to a `tf_device.cluster_func` -// result value. -std::optional GetXlaShardingFromResult(Value value) { - if (!value.hasOneUse()) return std::nullopt; +// Returns a TPUPartitionedOutput or TPUPartitionedInput op with XLA sharding +// connected to a `tf_device.cluster_func` result value (via AssignVariableOp/ +// resource write). 
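The rank check in VerifySharding above discounts trailing tile-assignment dimensions used for partial replication or subgroups before comparing against the tensor rank. A worked sketch with made-up numbers (the helper name is hypothetical):

#include <cassert>

// Illustrative sketch, not part of the patch: compare the tensor rank against
// the tile assignment rank after removing replication/subgroup dimensions.
bool ShardingRankCompatible(int tensor_rank, int tile_assignment_dims,
                            bool replicate_on_last_tile_dim,
                            int last_tile_dims) {
  int tile_assignment_rank = tile_assignment_dims;
  tile_assignment_rank -= replicate_on_last_tile_dim ? 1 : 0;
  tile_assignment_rank -= last_tile_dims;
  return tensor_rank >= tile_assignment_rank;
}

int main() {
  // A rank-2 tensor with tile assignment [2, 2, 4] and
  // replicate_on_last_tile_dim=true has effective sharded rank 2, so it
  // passes; the same tiling without the replicated trailing dimension would
  // shard a rank-2 tensor in one extra dimension and fail.
  assert(ShardingRankCompatible(/*tensor_rank=*/2, /*tile_assignment_dims=*/3,
                                /*replicate_on_last_tile_dim=*/true,
                                /*last_tile_dims=*/0));
  assert(!ShardingRankCompatible(2, 3, false, 0));
  return 0;
}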
+mlir::Operation* GetXlaShardingFromResult(Value value) { + if (!value.hasOneUse()) return nullptr; Operation* user = *value.getUsers().begin(); if (auto partitioned_output = llvm::dyn_cast(user)) - return partitioned_output.get_XlaSharding(); + return NullUnlessSharded(partitioned_output); if (auto assign_var = llvm::dyn_cast(user)) if (auto partitioned_input = assign_var.getResource() .getDefiningOp()) - return partitioned_input.get_XlaSharding(); + return NullUnlessSharded(partitioned_input); - return std::nullopt; + return nullptr; } // Looks up arg->retval aliases for every argument, and builds a reverse map. @@ -307,12 +388,12 @@ void ExtractAliases(func::FuncOp func, llvm::SmallVectorImpl& aliases) { // Returns XLA sharding from argument connected via tf.aliasing_output. std::optional GetXlaShardingFromAlias( Value value, llvm::SmallVectorImpl& aliases, - const llvm::SmallVectorImpl& sharding_for_args) { + const OpShardingVector& sharding_for_args) { int retval_index = value.cast().getResultNumber(); if (retval_index >= 0 && retval_index < aliases.size()) { int arg_index = aliases[retval_index]; if (arg_index >= 0 && arg_index < sharding_for_args.size()) { - return sharding_for_args[arg_index]; + return GetShardingStringFromVariant(sharding_for_args[arg_index]); } } return std::nullopt; @@ -394,8 +475,8 @@ void IdentifyXlaShardingForComputationOutputs( const llvm::SmallVector& logical_device_vec, bool use_spmd, bool infer_from_computation, tf_device::ClusterFuncOp cluster_func, func::FuncOp func, Builder* builder, - const llvm::SmallVectorImpl& sharding_for_args, - llvm::SmallVectorImpl& sharding_for_rets) { + const OpShardingVector& sharding_for_args, + OpShardingVector& sharding_for_rets) { Block& function_block = func.front(); Operation* terminator = function_block.getTerminator(); sharding_for_rets.reserve(terminator->getNumOperands()); @@ -418,7 +499,7 @@ void IdentifyXlaShardingForComputationOutputs( OpOperand& retval = std::get<1>(result_and_retval); if (auto result_sharding = GetXlaShardingFromResult(result)) { - sharding_for_rets.push_back(result_sharding.value()); + sharding_for_rets.push_back(result_sharding); continue; } @@ -477,21 +558,21 @@ LogicalResult IdentifyXlaShardingForTPUComputation( xla::sharding_builder::AssignDevice(idx).SerializeAsString(); } - llvm::SmallVector sharding_for_args; + OpShardingVector sharding_for_args; IdentifyXlaShardingForComputationInputs(logical_device_vec, use_spmd, /*infer_from_computation=*/true, cluster_func, func, builder, sharding_for_args); - llvm::SmallVector sharding_for_rets; + OpShardingVector sharding_for_rets; IdentifyXlaShardingForComputationOutputs( logical_device_vec, use_spmd, /*infer_from_computation=*/true, cluster_func, func, builder, sharding_for_args, sharding_for_rets); - auto has_maximal_sharding = [](llvm::StringRef sharding_string) -> bool { - xla::OpSharding sharding; - sharding.ParseFromString(sharding_string.str()); - return sharding.type() == xla::OpSharding::MAXIMAL; + auto has_maximal_sharding = + [](const OpShardingVariant& sharding_or_op) -> bool { + const auto sharding = GetShardingFromVariant(sharding_or_op); + return sharding && sharding->type() == xla::OpSharding::MAXIMAL; }; // XLA SPMD only supports cases where all inputs/outputs exist on every @@ -523,26 +604,30 @@ LogicalResult IdentifyXlaShardingForTPUComputation( Block& function_block = func.front(); for (auto sharding_and_arg : llvm::zip(sharding_for_args, function_block.getArguments())) { - StringRef sharding = std::get<0>(sharding_and_arg); 
BlockArgument arg = std::get<1>(sharding_and_arg); - func.setArgAttr(arg.getArgNumber(), kShardingAttr, - builder->getStringAttr(sharding)); + const auto& sharding_or_op = std::get<0>(sharding_and_arg); + if (auto sharding = GetShardingStringFromVariant(sharding_or_op)) { + func.setArgAttr(arg.getArgNumber(), kShardingAttr, + builder->getStringAttr(*sharding)); + } } Operation* terminator = function_block.getTerminator(); for (auto sharding_and_retval : llvm::zip(sharding_for_rets, terminator->getOpOperands())) { - StringRef sharding = std::get<0>(sharding_and_retval); OpOperand& retval = std::get<1>(sharding_and_retval); - func.setResultAttr(retval.getOperandNumber(), kShardingAttr, - builder->getStringAttr(sharding)); + const auto& sharding_or_op = std::get<0>(sharding_and_retval); + if (auto sharding = GetShardingStringFromVariant(sharding_or_op)) { + func.setResultAttr(retval.getOperandNumber(), kShardingAttr, + builder->getStringAttr(*sharding)); + } } // Update input/output sharding attributes on tf_device.cluster_func op. cluster_func->setAttr(tensorflow::kInputShardingAttr, - builder->getStrArrayAttr(sharding_for_args)); + GetStrArrayAttr(builder, sharding_for_args)); cluster_func->setAttr(tensorflow::kOutputShardingAttr, - builder->getStrArrayAttr(sharding_for_rets)); + GetStrArrayAttr(builder, sharding_for_rets)); return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_validate_inputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_validate_inputs.cc index c6909581e17..dd5465bf5d7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_validate_inputs.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_validate_inputs.cc @@ -12,11 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include #include +#include +#include +#include "absl/strings/match.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" namespace mlir { namespace TFTPU { @@ -26,34 +38,151 @@ namespace { #define GEN_PASS_DEF_TPUVALIDATEINPUTSPASS #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" +typedef std::unordered_map MetadataMap; + struct TPUValidateInputsPass : public impl::TPUValidateInputsPassBase { void runOnOperation() override; }; +bool IsTpuRegularOp(Operation* op) { + static auto* ops = [] { + llvm::SmallDenseSet* ops_set = + new llvm::SmallDenseSet{ + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + }; + return ops_set; + }(); + auto abstractOp = op->getRegisteredInfo(); + if (!abstractOp) return true; + return ops->count(abstractOp->getTypeID()) == 0; +} -bool ValidateReplicatedInput(TF::TPUReplicatedInputOp rep, int num_replicas) { - int arity = rep.getInputs().size(); - if (rep.getIsPacked() && arity != 1) { - rep.emitOpError( - "TF/XLA TPU bridge input check: packed with number of inputs not 1.") - << " num_replicas=" << num_replicas << " no. of inputs=" << arity; +bool IsIntersectionXlaNonXlaOps(Operation* op) { + static auto* ops = [] { + llvm::SmallDenseSet* ops_set = + new llvm::SmallDenseSet{ + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + }; + return ops_set; + }(); + auto abstractOp = op->getRegisteredInfo(); + if (!abstractOp) return true; + return ops->count(abstractOp->getTypeID()) == 0; +} + +bool IsPartitionedOp(Operation* op) { + static auto* ops = [] { + llvm::SmallDenseSet* ops_set = + new llvm::SmallDenseSet{ + TypeID::get(), + TypeID::get(), + TypeID::get(), + }; + return ops_set; + }(); + auto abstractOp = op->getRegisteredInfo(); + if (!abstractOp) return false; + return ops->count(abstractOp->getTypeID()) != 0; +} + +// Gets the successors of an op wrapped in a tf_executor.island. +llvm::SmallVector GetSuccessors(Operation* op) { + llvm::SmallVector successors; + for (auto result : op->getParentOp()->getOpResults()) { + for (auto& use : result.getUses()) { + auto succ = use.getOwner(); + successors.push_back(succ); + } + } + return successors; +} +// Gets the predecessors of an op wrapped in tf_executor.island. 
+llvm::SmallVector GetPredecessors(Operation* op) { + llvm::SmallVector predecessors; + for (auto operand : op->getOperands()) { + if (Operation* pred = operand.getDefiningOp()) { + pred->walk([&](mlir::Operation* opinexecutor) { + predecessors.push_back(opinexecutor); + }); + } + } + return predecessors; +} + +bool CheckTpuReplicateAttr(Operation* op, StringAttr attr, + std::function errormsg) { + if (!op->hasAttr(TF::kTpuReplicateAttr)) { + op->emitOpError("TF2XLA TPU bridge input check: " + errormsg() + + "missing _tpu_replicate attr"); return false; - } else if (!rep.getIsPacked() && arity != num_replicas) { - rep.emitOpError( - "TF/XLA TPU bridge input check: number of inputs inconsistent.") - << " num_replicas=" << num_replicas << " no. of inputs=" << arity; + } + auto opattr = op->getAttr(TF::kTpuReplicateAttr); + if (opattr != attr) { + op->emitOpError("TF2XLA TPU bridge input check: " + errormsg() + + "invalid _tpu_replicate attr.") + << " Expected attr: " << attr << ", Actual attr: " << opattr; return false; } return true; } -bool ValidateReplicatedOutput(TF::TPUReplicatedOutputOp rep, int num_replicas) { + +bool ValidateReplicatedInput(TF::TPUReplicatedInputOp rep, int num_replicas, + StringAttr attr) { + int arity = rep.getInputs().size(); + if (rep.getIsPacked() && arity != 1) { + rep.emitOpError( + "TF2XLA TPU bridge input check: packed with number of inputs not 1.") + << " num_replicas=" << num_replicas << " no. of inputs=" << arity; + return false; + } else if (!rep.getIsPacked() && arity != num_replicas) { + rep.emitOpError( + "TF2XLA TPU bridge input check: number of inputs inconsistent.") + << " num_replicas=" << num_replicas << " no. of inputs=" << arity; + return false; + } + for (auto& succ : GetSuccessors(rep)) { + if (!IsTpuRegularOp(succ)) continue; + auto errormsg = [&]() -> std::string { + return rep->getName().getStringRef().str() + " op has successor op " + + succ->getName().getStringRef().str() + " with error: "; + }; + if (!CheckTpuReplicateAttr(succ, attr, errormsg)) return false; + } + return true; +} +bool ValidateReplicatedOutput(TF::TPUReplicatedOutputOp rep, int num_replicas, + StringAttr attr) { int arity = rep.getOutputs().size(); if (arity != num_replicas) { rep.emitOpError( - "TF/XLA TPU bridge input check: number of outputs inconsistent.") + "TF2XLA TPU bridge input check: number of outputs inconsistent.") << " num_replicas=" << num_replicas << " no. of outputs=" << arity; return false; } + for (auto& pred : GetPredecessors(rep)) { + if (!IsTpuRegularOp(pred)) continue; + auto errormsg = [&]() -> std::string { + return rep->getName().getStringRef().str() + " op has predecessor op " + + pred->getName().getStringRef().str() + " with error: "; + }; + if (!CheckTpuReplicateAttr(pred, attr, errormsg)) return false; + } return true; } bool ValidatePartitionedInput(TF::TPUPartitionedInputOp rep, @@ -61,7 +190,7 @@ bool ValidatePartitionedInput(TF::TPUPartitionedInputOp rep, int arity = rep.getInputs().size(); if (arity != num_cores_per_replica) { rep.emitOpError( - "TF/XLA TPU bridge input check: number of inputs inconsistent.") + "TF2XLA TPU bridge input check: number of inputs inconsistent.") << " num_cores_per_replica=" << num_cores_per_replica << " no. 
of inputs=" << arity; return false; @@ -73,13 +202,13 @@ bool ValidatePartitionedInputV2(TF::TPUPartitionedInputV2Op rep, int arity = rep.getInputs().size(); if (rep.getIsPacked() && arity != 1) { rep.emitOpError( - "TF/XLA TPU bridge input check: packed with number of inputs not 1.") + "TF2XLA TPU bridge input check: packed with number of inputs not 1.") << " num_cores_per_replicas=" << num_cores_per_replica << " no. of inputs=" << arity; return false; } else if (!rep.getIsPacked() && arity != num_cores_per_replica) { rep.emitOpError( - "TF/XLA TPU bridge input check: number of inputs inconsistent.") + "TF2XLA TPU bridge input check: number of inputs inconsistent.") << " num_cores_per_replica=" << num_cores_per_replica << " no. of inputs=" << arity; return false; @@ -91,49 +220,229 @@ bool ValidatePartitionedOutput(T rep, int num_cores_per_replica) { int arity = rep.getOutput().size(); if (arity != num_cores_per_replica) { rep.emitOpError( - "TF/XLA TPU bridge input check: number of outputs inconsistent.") + "TF2XLA TPU bridge input check: number of outputs inconsistent.") << " num_cores_per_replica=" << num_cores_per_replica << " no. of outputs=" << arity; return false; } return true; } + +bool CheckReplicatedIOOp(Operation* op, TF::TPUReplicateMetadataOp metadata, + Operation* parent) { + int num_replicas = metadata.getNumReplicas(); + int num_cores_per_replica = metadata.getNumCoresPerReplica(); + StringAttr tpu_replicate_attr = + metadata->getAttrOfType(TF::kTpuReplicateAttr); + if (auto repinput = dyn_cast(op)) { + if (!ValidateReplicatedInput(repinput, num_replicas, tpu_replicate_attr)) + return false; + } + if (auto repoutput = dyn_cast(op)) { + if (!ValidateReplicatedOutput(repoutput, num_replicas, tpu_replicate_attr)) + return false; + } + if (auto partinput = dyn_cast(op)) { + if (!ValidatePartitionedInput(partinput, num_cores_per_replica)) + return false; + } + if (auto partinput = dyn_cast(op)) { + if (!ValidatePartitionedInputV2(partinput, num_cores_per_replica)) + return false; + } + if (auto partoutput = dyn_cast(op)) { + if (!ValidatePartitionedOutput(partoutput, num_cores_per_replica)) + return false; + } + if (auto partoutput = dyn_cast(op)) { + if (!ValidatePartitionedOutput(partoutput, num_cores_per_replica)) + return false; + } + return true; +} +// Checking op which is successor to a cluster op. +bool CheckClusterSuccessors(Operation* op, std::string cluster, + Operation* parent, MetadataMap& metadata_map) { + std::string cluster_succ = ""; + if (op->hasAttr(TF::kTpuReplicateAttr)) { + cluster_succ = op->getAttrOfType(TF::kTpuReplicateAttr).str(); + } + if (cluster_succ.empty()) { + // TODO (b/269195256#comment16): Change to error after resolving issue + // with test. Will fix it after the upstream code is fixed. + op->emitWarning("TF2XLA TPU bridge input check: cluster op = ") + << parent->getName() << " with cluster = " << cluster + << " has successor as non cluster op " << op->getName(); + return true; + } + if (cluster != cluster_succ) { + op->emitOpError( + "TF2XLA TPU bridge input check: mismatch clusters tpu_replicate " + "attr. Parent op ") + << parent->getName() << " with cluster = " << cluster + << " has successor cluster op " << op->getName() + << " with cluster = " << cluster_succ; + return false; + } + return true; +} + +// Checking op which is a predecessor to a non-cluster op. 
+bool CheckNonClusterSuccessors(Operation* op, Operation* parent, + MetadataMap& metadata_map) { + if (!IsTpuRegularOp(op)) { + if (isa(op)) { + op->emitOpError("TF2XLA TPU bridge input check: non-cluster op = ") + << parent->getName() + << " has invalid successor op = " << op->getName(); + return false; + } else { + return true; + } + } + return true; +} +// Checking op which is a successor to a non-cluster op. +bool CheckNonClusterPredecessors(Operation* op, Operation* parent, + MetadataMap& metadata_map) { + if (!IsTpuRegularOp(op)) { + if (isa(op)) { + op->emitOpError("TF2XLA TPU bridge input check: non-cluster op = ") + << parent->getName() + << " has invalid predecessor op = " << op->getName(); + return false; + } else { + return true; + } + } + return true; +} + +bool CheckOpsClusterIO(Operation* op, MetadataMap& metadata_map) { + bool is_cluster_op = false; + std::string cluster = ""; + if (op->hasAttr(TF::kTpuReplicateAttr)) { + cluster = op->getAttrOfType(TF::kTpuReplicateAttr).str(); + if (cluster.empty()) { + op->emitOpError("TF2XLA TPU bridge input check: empty _tpu_replicate") + << " attr for op = " << op->getName(); + return false; + } + is_cluster_op = true; + } + bool has_cluster_metadata = + (metadata_map.find(cluster) != metadata_map.end()); + + for (auto pred : GetPredecessors(op)) { + if (is_cluster_op && !IsTpuRegularOp(pred) && has_cluster_metadata) { + if (!CheckReplicatedIOOp(pred, metadata_map[cluster], op)) return false; + } + if (!is_cluster_op) { + if (!CheckNonClusterPredecessors(pred, op, metadata_map)) return false; + } + } + + for (auto succ : GetSuccessors(op)) { + if (is_cluster_op && !IsTpuRegularOp(succ) && has_cluster_metadata) { + if (!CheckReplicatedIOOp(succ, metadata_map[cluster], op)) return false; + } + if (is_cluster_op && IsTpuRegularOp(succ)) { + if (!CheckClusterSuccessors(succ, cluster, op, metadata_map)) + return false; + } + if (!is_cluster_op) { + if (!CheckNonClusterSuccessors(succ, op, metadata_map)) return false; + } + } + return true; +} + +bool TypeMustBeNonXLA(const Type& type) { + const Type elem = getElementTypeOrSelf(type); + return !elem.isa() && !tensorflow::TypeValidForXLA(type); +} + +// Check if the op cannot be XLA compiled. If the op does not satisfy this +// criteria, then it is possible for the op to be XLA and non-XLA. But this +// function specifically checks if the op must be non-xla. +bool IsMustNotBeXlaOp(Operation* op) { + for (auto& input : op->getOpOperands()) { + if (TypeMustBeNonXLA(input.get().getType())) return true; + } + for (auto output_types : op->getResultTypes()) { + if (TypeMustBeNonXLA(output_types)) return true; + } + return false; +} + +// Check if the op must be compiled with XLA. If the op does not satisfy this +// critiria for "must be xla" then it is still possible for this op to be xla +// and non-xla as well. But below function specifically checks for the op to be +// only XLA op. +bool IsMustBeXlaOp(Operation* op, MetadataMap metadata_map) { + // All PartitionedCall are inlined-out before XLA. 
+ // So MustBeXLA should return false + if (IsPartitionedOp(op)) return false; + if (!op->hasAttr(TF::kTpuReplicateAttr)) return false; + auto cluster = op->getAttrOfType(TF::kTpuReplicateAttr).str(); + if (metadata_map.find(cluster) == metadata_map.end()) return false; + auto metadata = metadata_map[cluster]; + if (!metadata.getAllowSoftPlacement() && + !op->hasAttr(TF::kXlaOutsideCompilationAttr)) + return true; + std::string device = ""; + if (op->hasAttr(TF::kDeviceAttr)) + device = op->getAttrOfType(TF::kDeviceAttr).str(); + else + return false; + if (absl::StrContains(device, TF::kTpuDevice)) return true; + return false; +} +bool ValidateIntersectionXlaNonXlaOps(Operation* op, MetadataMap metadata_map) { + if (isa(op) || + isa(op) || isa(op) || + isa(op) || + isa(op) || + isa(op) || + isa(op)) + return true; + if (IsMustBeXlaOp(op, metadata_map) && IsMustNotBeXlaOp(op)) { + // TODO(b/269195256#comment19) change the warning for Identity op to error + // when issue with input graph is resolved. Possible issue with python layer + // inserting Identity op incorrectly. + if (isa(op)) { + op->emitWarning("TF/XLA TPU bridge input check: found invalid op. ") + << op->getName() << " can't be both xla and non-xla"; + return true; + } + op->emitOpError("TF/XLA TPU bridge input check: found invalid op. ") + << "Can't be both xla and non-xla"; + return false; + } + return true; +} + void TPUValidateInputsPass::runOnOperation() { ModuleOp module = getOperation(); bool success = true; int num_metadata = 0; TF::TPUReplicateMetadataOp metadata; + MetadataMap metadata_map; module.walk([&](TF::TPUReplicateMetadataOp meta) { ++num_metadata; metadata = meta; + metadata_map[meta->getAttrOfType(TF::kTpuReplicateAttr).str()] = + meta; + }); + + getOperation().walk([&](mlir::Operation* op) { + if (IsTpuRegularOp(op)) { + success &= CheckOpsClusterIO(op, metadata_map); + } + if (IsIntersectionXlaNonXlaOps(op)) { + success &= ValidateIntersectionXlaNonXlaOps(op, metadata_map); + } }); - // TODO(b/269195256): support multi-TPUReplicateMetadata case. - // Currently handling case with one metadata op / cluster. Further CLs will - // address cases with multi-TPUReplicatedMetadata. 
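The intersection check above flags an op only when both predicates hold at once: its cluster metadata forces XLA compilation while one of its operand or result types can only live outside XLA. A compact sketch of that decision (the struct and field names are hypothetical):

// Illustrative sketch, not part of the patch.
struct OpClassification {
  bool must_be_xla;      // clustered without soft placement, or on a TPU device
  bool must_not_be_xla;  // carries a type that is not valid for XLA
};

bool IsInvalidXlaIntersection(const OpClassification& op) {
  return op.must_be_xla && op.must_not_be_xla;
}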
- if (num_metadata == 1) { - int num_replicas = metadata.getNumReplicas(); - int num_cores_per_replica = metadata.getNumCoresPerReplica(); - module.walk([&](mlir::Operation* op) { - if (auto repinput = dyn_cast(op)) { - success &= ValidateReplicatedInput(repinput, num_replicas); - } - if (auto repoutput = dyn_cast(op)) { - success &= ValidateReplicatedOutput(repoutput, num_replicas); - } - if (auto partinput = dyn_cast(op)) { - success &= ValidatePartitionedInput(partinput, num_cores_per_replica); - } - if (auto partinput = dyn_cast(op)) { - success &= ValidatePartitionedInputV2(partinput, num_cores_per_replica); - } - if (auto partoutput = dyn_cast(op)) { - success &= ValidatePartitionedOutput(partoutput, num_cores_per_replica); - } - if (auto partoutput = dyn_cast(op)) { - success &= ValidatePartitionedOutput(partoutput, num_cores_per_replica); - } - }); - } if (!success) { signalPassFailure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc index df3aba4eeb0..f876231ab00 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc @@ -57,6 +57,9 @@ void EncapsulatePartitionedCall(Operation *call_op) { builder.setInsertionPointToEnd(&cluster.GetBody()); builder.create(call_op->getLoc(), call_op->getResults()); + // Propagate necessary attributes to the cluster so that when it's outlined, + // the function will have correct attributes. + TF::CopyDeviceAndUnderscoredAttributes(call_op, cluster); } void XlaClusterFormationPass::runOnOperation() { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc index 550e5804430..c59d6e532d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc @@ -60,14 +60,11 @@ void MoveResourceArgsToEnd(func::FuncOp callee) { callee.getResultTypes())); } -template ::value>::type * = nullptr> -void RewriteCall(OpT call_op, SymbolTable &symtab) { +void RewriteCall(tf_device::ClusterFuncOp cluster_func_op, SymbolTable &symtab, + OpBuilder &builder) { llvm::SmallVector non_resource_args, resource_args; bool has_resources = false, in_order = true; - for (const Value &arg : call_op.getArgs()) { + for (const Value &arg : cluster_func_op.getOperands()) { if (!getElementTypeOrSelf(arg.getType()).template isa()) { non_resource_args.push_back(arg); if (has_resources) in_order = false; @@ -80,33 +77,26 @@ void RewriteCall(OpT call_op, SymbolTable &symtab) { if (!in_order) { // Functions do not get reused in practice, so skip the check for if the // callee has been updated. 
- StringAttr callee_sym = - cast(call_op.getFAttr()).getRootReference(); + StringAttr callee_sym = cluster_func_op.getFuncAttr().getAttr(); MoveResourceArgsToEnd(symtab.lookup(callee_sym)); } - OpBuilder builder(call_op->getContext()); - builder.setInsertionPoint(call_op); + builder.setInsertionPoint(cluster_func_op); auto xla_launch_op = builder.create( - call_op.getLoc(), call_op.getResultTypes(), + cluster_func_op.getLoc(), cluster_func_op.getResultTypes(), /*constants=*/ValueRange({}), ValueRange(non_resource_args), - ValueRange(resource_args), call_op.getFAttr()); + ValueRange(resource_args), cluster_func_op.getFuncAttr()); - CopyDeviceAndUnderscoredAttributes(call_op, xla_launch_op); - call_op.replaceAllUsesWith(xla_launch_op.getResults()); - call_op.erase(); + CopyDeviceAndUnderscoredAttributes(cluster_func_op, xla_launch_op); + cluster_func_op.replaceAllUsesWith(xla_launch_op.getResults()); + cluster_func_op.erase(); } void XlaRewritePass::runOnOperation() { ModuleOp module = getOperation(); SymbolTable symtab(module); - module.walk([&](tf_device::ClusterOp cluster_op) { - cluster_op.getBody().walk([&](mlir::Operation *op) { - if (auto call_op = llvm::dyn_cast(op)) { - RewriteCall(call_op, symtab); - } else if (auto call_op = llvm::dyn_cast(op)) { - RewriteCall(call_op, symtab); - } - }); + OpBuilder builder(&getContext()); + module.walk([&](tf_device::ClusterFuncOp cluster_func_op) { + RewriteCall(cluster_func_op, symtab, builder); }); // Verify that there are no nested XLA launch ops. diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 4aff79c8585..61490f6a749 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -63,6 +63,7 @@ limitations under the License. #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/regularization/util.h" #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" @@ -135,7 +136,9 @@ class Exporter { private: explicit Exporter(Graph* graph, const Dialect* tf_dialect) - : graph_(graph), tf_dialect_(tf_dialect) {} + : graph_(graph), tf_dialect_(tf_dialect) { + graph_->ToGraphDef(&graphdef_); + } Status AddArgumentNode(BlockArgument arg, unsigned index, llvm::StringRef name); @@ -158,6 +161,7 @@ class Exporter { Status AddEdgeBetweenNodes(Value src, Node* dst_node, unsigned dst_index); Graph* graph_; + GraphDef graphdef_; LegalizedOpOrValLocNameMapper op_to_name_; absl::flat_hash_map nodes_; llvm::DenseMap args_; @@ -358,7 +362,8 @@ Status Exporter::AddEdge(Operation* inst) { Status Exporter::AddInstructionNode(Operation* inst) { std::unique_ptr node_def; - auto name = op_to_name_.GetUniqueName(inst); + int graph_hash_value = graph_regularization::ComputeHash(graphdef_); + auto name = op_to_name_.GetUniqueName(inst, graph_hash_value); // Convert registered TF ops to NodeDef. Only registered ops are handled to // ensure that PopulateDerivedAttrs adds the correct attributes. 
TF_ASSIGN_OR_RETURN(node_def, diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index ce9d086f22a..b8ba989b33b 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -99,6 +100,7 @@ limitations under the License. #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" @@ -124,7 +126,6 @@ limitations under the License. #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/threadpool.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" #include "tensorflow/core/protobuf/saver.pb.h" @@ -1089,7 +1090,7 @@ StatusOr ImporterBase::InferOutputType(const Node& node, int idx, return errors::InvalidArgument( "Node '", node.name(), " has an invalid ", kOutputShapesAttrName, " attribute (shape #", idx, " error:'", - s.error_message(), "')"); + s.message(), "')"); c->set_output(idx, h); } } @@ -1680,7 +1681,7 @@ Status ImporterBase::ConvertFunctionArgAndRets( } llvm::SmallVector inst_to_return; - for (auto ret_and_idx : llvm::enumerate(ret_nodes)) { + for (const auto& ret_and_idx : llvm::enumerate(ret_nodes)) { const auto& ret = ret_and_idx.value(); auto* inst = node_values_[ret.node->id()]; if (ret.node->IsRetval()) { @@ -1772,6 +1773,7 @@ mlir::Location ImporterBase::GetLocation(const Node& node) { mlir::FileLineColLoc::get(file_name, frame.line_number, 1); locations.push_back(file_line_loc); } + stack_trace->WipeCache(); } else { DVLOG(1) << "No stack trace for " << node.name(); const auto location_it = debug_info.find(debug_info_key); @@ -2353,7 +2355,8 @@ class GraphDefImporter : public ImporterBase { mlir::MLIRContext* context, const Graph& graph, const GraphDebugInfo& debug_info, const FunctionLibraryDefinition& flib_def, const GraphImportConfig& specs, - std::unordered_map& tf_name_to_mlir_name); + std::unordered_map& tf_name_to_mlir_name, + bool disable_crash_analysis = false); private: explicit GraphDefImporter( @@ -2395,7 +2398,8 @@ StatusOr> GraphDefImporter::Convert( mlir::MLIRContext* context, const Graph& graph, const GraphDebugInfo& debug_info, const FunctionLibraryDefinition& flib_def, const GraphImportConfig& specs, - std::unordered_map& tf_name_to_mlir_name) { + std::unordered_map& tf_name_to_mlir_name, + bool disable_crash_analysis) { LoadImporterDialects(*context); mlir::OwningOpRef module = mlir::ModuleOp::create(mlir::UnknownLoc::get(context)); @@ -2405,22 +2409,28 @@ StatusOr> GraphDefImporter::Convert( // via conversion to the graph def first. Convert graph to graph_def here // first and avoid extra copies later. 
auto graph_def = std::make_unique(); - graph.ToGraphDef(graph_def.get()); + graph.ToGraphDef(graph_def.get(), /*include_flib_def=*/false); - static std::atomic counter(0); - uint32 current_file_prefix = counter++; - const auto* graph_crash_handle = crash_analysis::ReportProtoDataOnCrash( - absl::StrCat(current_file_prefix, "_mlir_import_graph.pbtxt"), - *graph_def); - auto reachable_flib = flib_def.ReachableDefinitions(*graph_def); - const auto* flib_crash_handle = crash_analysis::ReportProtoDataOnCrash( - absl::StrCat(current_file_prefix, "_mlir_import_flib.pbtxt"), - reachable_flib.ToProto()); + auto scope_exit = [&]() { + std::function cleanup = []() {}; + if (!disable_crash_analysis) { + static std::atomic counter(0); + uint32 current_file_prefix = counter++; + const auto* graph_crash_handle = crash_analysis::ReportProtoDataOnCrash( + absl::StrCat(current_file_prefix, "_mlir_import_graph.pbtxt"), + *graph_def); + auto reachable_flib = flib_def.ReachableDefinitions(*graph_def); + const auto* flib_crash_handle = crash_analysis::ReportProtoDataOnCrash( + absl::StrCat(current_file_prefix, "_mlir_import_flib.pbtxt"), + reachable_flib.ToProto()); + cleanup = [=]() { + crash_analysis::RemoveReportData(graph_crash_handle); + crash_analysis::RemoveReportData(flib_crash_handle); + }; + } - auto scope_exit = llvm::make_scope_exit([&]() { - crash_analysis::RemoveReportData(graph_crash_handle); - crash_analysis::RemoveReportData(flib_crash_handle); - }); + return llvm::make_scope_exit(std::move(cleanup)); + }(); VLOG(2) << "Importing: " << ::tensorflow::DumpGraphToFile("tf_mlir_importer_base", graph, @@ -2471,6 +2481,11 @@ StatusOr> GraphDefImporter::Convert( attrs.push_back(b.getNamedAttr( "tf.entry_function", b.getDictionaryAttr({inputs, outputs, control_outputs}))); + if (!specs.xla_compile_device_type.empty()) { + attrs.push_back( + b.getNamedAttr("_xla_compile_device_type", + b.getStringAttr(specs.xla_compile_device_type))); + } } else { // Collects the argument and return nodes by looking up the node names // specified by the user. @@ -2539,7 +2554,7 @@ StatusOr GraphDefImporter::InferMainFunctionType( // Feeds have been remapped to single output nodes (Placeholder), so an exact // name match is sufficient. 
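The crash-analysis hunk above guards the ReportProtoDataOnCrash registration behind `disable_crash_analysis` by building the cleanup inside an immediately-invoked lambda and returning a single `llvm::make_scope_exit` guard from it. A standalone sketch of just that control-flow pattern, with hypothetical Register/Unregister helpers standing in for the crash_analysis calls:

#include <atomic>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

#include "llvm/ADT/ScopeExit.h"

// Hypothetical stand-ins so the sketch is self-contained; the real code uses
// crash_analysis::ReportProtoDataOnCrash / RemoveReportData.
int RegisterCrashReport(const std::string& name) {
  std::cout << "registered " << name << "\n";
  return 42;  // pretend handle
}
void UnregisterCrashReport(int handle) {
  std::cout << "unregistered handle " << handle << "\n";
}

void Convert(bool disable_crash_analysis) {
  // Immediately-invoked lambda: decide on the cleanup up front, then hand it
  // to a single scope guard. With crash analysis disabled the guard holds a
  // no-op; either way the cleanup runs on every exit path of Convert.
  auto scope_exit = [&]() {
    std::function<void()> cleanup = []() {};
    if (!disable_crash_analysis) {
      static std::atomic<uint32_t> counter(0);
      const uint32_t prefix = counter++;
      const int handle = RegisterCrashReport(
          std::to_string(prefix) + "_mlir_import_graph.pbtxt");
      cleanup = [=]() { UnregisterCrashReport(handle); };
    }
    return llvm::make_scope_exit(std::move(cleanup));
  }();
  // ... conversion work would happen here ...
}

int main() {
  Convert(/*disable_crash_analysis=*/false);
  Convert(/*disable_crash_analysis=*/true);
}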
absl::flat_hash_map inputs; - for (auto input_and_idx : llvm::enumerate(specs.inputs)) { + for (const auto& input_and_idx : llvm::enumerate(specs.inputs)) { TensorId tensor = ParseTensorName(input_and_idx.value().first); auto remapped_it = remapped_feeds_.find(tensor); if (remapped_it != remapped_feeds_.end()) { @@ -2700,7 +2715,7 @@ GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( mlir::Builder builder(context); llvm::SmallVector arg_types; arg_types.reserve(arg_nodes->size()); - for (auto arg_node_and_idx : llvm::enumerate(*arg_nodes)) { + for (const auto& arg_node_and_idx : llvm::enumerate(*arg_nodes)) { auto& arg_node = arg_node_and_idx.value(); if (arg_node.node == nullptr) return errors::InvalidArgument("Graph missing _Arg at index ", @@ -2713,7 +2728,7 @@ GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( llvm::SmallVector ret_types; ret_types.reserve(ret_nodes->size()); - for (auto ret_node_and_idx : llvm::enumerate(*ret_nodes)) { + for (const auto& ret_node_and_idx : llvm::enumerate(*ret_nodes)) { auto& ret_node = ret_node_and_idx.value(); if (ret_node.node == nullptr) return errors::InvalidArgument("Graph missing _Retval at index ", @@ -2733,7 +2748,7 @@ Status GraphDefImporter::GetControlRetsFromGraph( if (control_outputs.empty()) return OkStatus(); llvm::SmallDenseMap controls_to_idx; - for (auto control_and_idx : llvm::enumerate(control_outputs)) + for (const auto& control_and_idx : llvm::enumerate(control_outputs)) controls_to_idx.insert({control_and_idx.value(), control_and_idx.index()}); if (controls_to_idx.size() != control_outputs.size()) @@ -3411,12 +3426,12 @@ Status CreateSavedModelIR( function.concrete_functions(0), "' (", input_index_paths.size(), " vs ", bound_input_base, ")"); } - for (auto index_path : llvm::enumerate(input_index_paths)) { + for (const auto& index_path : llvm::enumerate(input_index_paths)) { func.setArgAttr(index_path.index(), kTfSavedModelIndexPathAttr, index_path.value()); } - for (auto& bound_input : + for (const auto& bound_input : llvm::enumerate(concrete_function.bound_inputs())) { int arg_index = bound_input_base + bound_input.index(); auto symbol_ref = mlir::SymbolRefAttr::get( @@ -3438,7 +3453,7 @@ Status CreateSavedModelIR( function.concrete_functions(0), "' (", output_index_paths.size(), " vs ", func.getNumResults(), ")"); } - for (auto index_path : llvm::enumerate(output_index_paths)) { + for (const auto& index_path : llvm::enumerate(output_index_paths)) { func.setResultAttr(index_path.index(), kTfSavedModelIndexPathAttr, index_path.value()); } @@ -3560,8 +3575,8 @@ SavedModelObjectGraphImporter::Convert(SavedModelV2Bundle* saved_model, TF_RETURN_IF_ERROR(PreprocessGraphDef(nullptr, &preprocessed_graphdef)); } - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(options, preprocessed_graphdef, &graph)); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + options, std::move(preprocessed_graphdef), &graph)); NameUniquifier function_name_uniquifier(graph.flib_def()); SavedModelObjectGraphImporter importer(graph.flib_def(), debug_info, specs, @@ -3615,7 +3630,7 @@ class SimpleSavedModelMLIRImportInput : public SavedModelMLIRImportInput { const MLIRImportOptions& import_options, const MetaGraphDef* meta_graph_def, const GraphDebugInfo& debug_info) { DCHECK(meta_graph_def); - GraphDef graph_def = meta_graph_def->graph_def(); + GraphDef graph_def(meta_graph_def->graph_def()); auto graph = std::make_unique(OpRegistry::Global()); if (import_options.upgrade_legacy) { @@ -3626,8 +3641,8 @@ class SimpleSavedModelMLIRImportInput : public 
SavedModelMLIRImportInput { GraphConstructorOptions graph_ctor_options; graph_ctor_options.allow_internal_ops = true; graph_ctor_options.add_default_attributes = true; - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(graph_ctor_options, graph_def, graph.get())); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + graph_ctor_options, std::move(graph_def), graph.get())); if (import_options.upgrade_legacy) { // TODO(jpienaar): Remove need to const_cast. @@ -3941,7 +3956,8 @@ SavedModelSignatureDefImporterLite::ConvertGraph( // Convert sub-graph to MLIR module. return GraphDefImporter::Convert(module_->getContext(), *subgraph, input_.debug_info(), subgraph->flib_def(), - specs, tf_name_to_mlir_name); + specs, tf_name_to_mlir_name, + /*disable_crash_analysis=*/true); } Status SavedModelSignatureDefImporterLite::ConvertSignature( @@ -3983,11 +3999,11 @@ Status SavedModelSignatureDefImporterLite::ConvertSignature( builder.getStrArrayAttr({sig_def_key})); // Transfer input and output parameter names to index_path attributes. - for (auto input_and_idx : llvm::enumerate(inputs)) { + for (const auto& input_and_idx : llvm::enumerate(inputs)) { func_op.setArgAttr(input_and_idx.index(), kTfSavedModelIndexPathAttr, builder.getStrArrayAttr({input_and_idx.value().first})); } - for (auto output_and_idx : llvm::enumerate(outputs)) { + for (const auto& output_and_idx : llvm::enumerate(outputs)) { func_op.setResultAttr( output_and_idx.index(), kTfSavedModelIndexPathAttr, builder.getStrArrayAttr({output_and_idx.value().first})); @@ -4170,7 +4186,9 @@ class SavedModelSignatureDefImporter { mlir::OpBuilder builder(module->getContext()); (*module)->setAttr("tf_saved_model.under_construction", builder.getUnitAttr()); - TF_RETURN_IF_ERROR(LiftVariables(bundle, *module, options.lift_variables)); + TF_RETURN_IF_ERROR( + LiftVariables(bundle, *module, options.lift_variables, + options.include_variables_in_initializers)); (*module)->removeAttr("tf_saved_model.under_construction"); return module; @@ -4178,14 +4196,21 @@ class SavedModelSignatureDefImporter { private: // Lifts the variables in `module`. + // If `include_variables_in_initializers` is set to false, then it removes all + // variables from the initializer functions (registered in the + // `tf_saved_model::SessionInitializerOp`) by running the + // `RemoveVariablesInSessionInitializerPass`, regardless of whether + // `lift_variable_ops_to_args` is true or not. 
static Status LiftVariables(const SavedModelBundle& bundle, mlir::ModuleOp module, - bool lift_varhandle_ops_to_args); + bool lift_varhandle_ops_to_args, + bool include_variables_in_initializers); }; Status SavedModelSignatureDefImporter::LiftVariables( const SavedModelBundle& bundle, mlir::ModuleOp module, - bool lift_varhandle_ops_to_args) { + const bool lift_varhandle_ops_to_args, + const bool include_variables_in_initializers) { mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); mlir::PassManager pm(module.getContext()); @@ -4194,8 +4219,10 @@ Status SavedModelSignatureDefImporter::LiftVariables( mlir::tf_executor::CreateTFExecutorGraphPruningPass()); pm.addNestedPass( mlir::CreateExecutorDialectToFunctionalConversionPass()); - pm.addPass( - mlir::tf_saved_model::CreateRemoveVariablesInSessionInitializerPass()); + if (!include_variables_in_initializers) { + pm.addPass( + mlir::tf_saved_model::CreateRemoveVariablesInSessionInitializerPass()); + } pm.addNestedPass( mlir::TF:: CreateConvertReadonlyReferenceVariablesToResourceVariablesPass()); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h index ac10baa94c3..182a53078ba 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h @@ -30,8 +30,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" namespace tensorflow { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h index e1b45dda1c5..44262d0bd08 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h @@ -32,9 +32,20 @@ struct MLIRImportOptions { // Apply default attributes from the op definition to the loaded op. bool add_default_attributes = true; - // If set, promote tf.VarHandleOp to resource arguments for all functions. + // If set, promote tf.VarHandleOp to resource arguments for all functions. bool lift_variables = true; + // Keeps the variables in initializers before lifting variables (when + // `lift_variables == true`) or newly adding variable initialization patterns + // in the initializer functions. One might want to set this to `true` because + // the `RemoveVariablesInSessionInitializerPass` pass, which runs otherwise, + // may unexpectedly also remove the initialization patterns for non-variable + // resources (like hash tables) if they involve variables. Such a case is + // illustrated in the test file + // "../tests/tf_saved_model_remove_vars_in_session_initializer.mlir". + // This defaults to `false` to avoid breaking existing uses. + bool include_variables_in_initializers = false; + // Load the model without restoring associated variables from disk. Enables // loading raw programs without checkpoints. 
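A caller-side sketch of `include_variables_in_initializers`: opting in keeps variable-initialization patterns in the session initializer so that hash-table style resources initialized from variables are not stripped during import. The helper below is hypothetical; the other fields shown already exist on MLIRImportOptions.

#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h"

// Hypothetical helper: builds the options that would be handed to one of the
// SavedModel import entry points declared in import_model.h.
tensorflow::MLIRImportOptions MakeImportOptions() {
  tensorflow::MLIRImportOptions options;
  options.upgrade_legacy = true;
  options.lift_variables = true;
  // Opt in: keep variable initialization in the initializer functions so the
  // RemoveVariablesInSessionInitializerPass is not run during import.
  options.include_variables_in_initializers = true;
  return options;
}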
bool allow_uninitialized_variables = false; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc index 12db69d867b..3c703722b82 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc @@ -62,6 +62,7 @@ std::string GraphImportConfig::str() const { ss << "\nenable_shape_inference: " << enable_shape_inference; ss << "\nunconditionally_use_set_output_shapes: " << unconditionally_use_set_output_shapes; + ss << "\nxla_compile_device_type: " << xla_compile_device_type; return ss.str(); } @@ -245,7 +246,7 @@ static StatusOr> ParseDTypesHelper( bool inside_subtype = false; int cur_pos = 0; std::vector dtypes; - for (auto& it : llvm::enumerate(data_types_str)) { + for (const auto& it : llvm::enumerate(data_types_str)) { char c = it.value(); int i = it.index(); // Skip parsing the subtypes of a type diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h index 5eb7b25a126..79d364bf6b2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h @@ -90,6 +90,9 @@ struct GraphImportConfig { // so make it opt-in to consider it unconditionally also when importing the // graph. bool unconditionally_use_set_output_shapes = false; + // If set, use the value as the device type and mark the function graph for + // XLA compilation. + string xla_compile_device_type; }; struct GraphExportConfig { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc index 65a6dbaa1c5..84ae5a522e2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h" +#include + #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -25,7 +27,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/graph_constructor.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" namespace tensorflow { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/split_into_island_per_op_pass.cc b/tensorflow/compiler/mlir/tensorflow/translate/split_into_island_per_op_pass.cc index a1dced4bf5e..c5e059e3a67 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/split_into_island_per_op_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/split_into_island_per_op_pass.cc @@ -176,7 +176,7 @@ void SplitIsland(mlir::tf_executor::IslandOp island_op, // `island_op.getControl().dropAllUses();` of a control dep that's only used // in a graph's fetch, immediately leads to a segfault. Turns out we need to // drop its uses manually so that we don't leave dangling controls. 
- for (auto& fetch : llvm::enumerate(graph_op.GetFetch().getFetches())) { + for (const auto& fetch : llvm::enumerate(graph_op.GetFetch().getFetches())) { if (fetch.value() == island_op.getControl()) { graph_op.GetFetch().getFetchesMutable().erase(fetch.index(), 1); break; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 66f511d1a93..233d35d8c01 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -34,18 +34,19 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/import_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/grappler/utils/transitive_fanin.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/util/tensor_bundle/byte_swap_tensor.h" namespace tensorflow { static StatusOr> GraphdefToMlirImport( llvm::StringRef input, absl::string_view debug_info_file, + absl::string_view xla_compile_device_type, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, @@ -73,6 +74,7 @@ static StatusOr> GraphdefToMlirImport( specs.enable_shape_inference = enable_shape_inference; specs.unconditionally_use_set_output_shapes = unconditionally_use_set_output_shapes; + specs.xla_compile_device_type = xla_compile_device_type; TF_RETURN_IF_ERROR(ParseInputArrayInfo(input_arrays, input_dtypes, input_shapes, &specs.inputs)); TF_RETURN_IF_ERROR(ParseOutputArrayInfo(output_arrays, &specs.outputs)); @@ -108,6 +110,7 @@ static StatusOr> GraphdefToMlirImport( StatusOr> GraphdefToMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, + absl::string_view xla_compile_device_type, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, @@ -117,10 +120,11 @@ StatusOr> GraphdefToMlirTranslateFunction( bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( - input, debug_info_file, input_arrays, input_dtypes, input_shapes, - output_arrays, control_output_arrays, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, - enable_shape_inference, unconditionally_use_set_output_shapes, context); + input, debug_info_file, xla_compile_device_type, input_arrays, + input_dtypes, input_shapes, output_arrays, control_output_arrays, + prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, + upgrade_legacy, enable_shape_inference, + unconditionally_use_set_output_shapes, context); if (!module_or.status().ok()) { LOG(ERROR) << "Graph import failed: " << module_or.status(); } @@ -129,12 +133,12 @@ StatusOr> GraphdefToMlirTranslateFunction( StatusOr> GraphdefToMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view input_arrays, absl::string_view input_dtypes, - absl::string_view input_shapes, absl::string_view output_arrays, - absl::string_view control_output_arrays, bool prune_unused_nodes, - 
bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - bool enable_shape_inference, bool unconditionally_use_set_output_shapes, - mlir::MLIRContext* context) { + absl::string_view xla_compile_device_type, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, absl::string_view control_output_arrays, + bool prune_unused_nodes, bool convert_legacy_fed_inputs, + bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, + bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context) { std::vector input_array_vector; std::vector input_dtype_vector; std::vector>> input_shapes_vector; @@ -147,11 +151,11 @@ StatusOr> GraphdefToMlirTranslateFunction( TF_RETURN_IF_ERROR( ParseNodeNames(control_output_arrays, control_output_array_vector)); return GraphdefToMlirTranslateFunction( - input, debug_info_file, input_array_vector, input_dtype_vector, - input_shapes_vector, output_array_vector, control_output_array_vector, - prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, - upgrade_legacy, enable_shape_inference, - unconditionally_use_set_output_shapes, context); + input, debug_info_file, xla_compile_device_type, input_array_vector, + input_dtype_vector, input_shapes_vector, output_array_vector, + control_output_array_vector, prune_unused_nodes, + convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, + enable_shape_inference, unconditionally_use_set_output_shapes, context); } StatusOr> SavedModelObjectGraphToMlirImport( @@ -249,6 +253,7 @@ SavedModelSignatureDefsToMlirImportLite( StatusOr> GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, + absl::string_view xla_compile_device_type, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, @@ -258,10 +263,11 @@ GraphdefToSplattedMlirTranslateFunction( bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( - input, debug_info_file, input_arrays, input_dtypes, input_shapes, - output_arrays, control_output_arrays, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, - enable_shape_inference, unconditionally_use_set_output_shapes, context); + input, debug_info_file, xla_compile_device_type, input_arrays, + input_dtypes, input_shapes, output_arrays, control_output_arrays, + prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, + upgrade_legacy, enable_shape_inference, + unconditionally_use_set_output_shapes, context); if (!module_or.status().ok()) { LOG(ERROR) << "Graph import failed: " << module_or.status(); return module_or.status(); @@ -274,7 +280,7 @@ GraphdefToSplattedMlirTranslateFunction( auto attr_id = mlir::StringAttr::get(context, "value"); if (auto attr = inst.getAttrOfType(attr_id)) { mlir::Attribute rand_val; - mlir::Type element_type = attr.getType().getElementType(); + mlir::Type element_type = attr.getShapedType().getElementType(); if (element_type.isa()) { rand_val = mlir::IntegerAttr::get(element_type, std::rand()); } else if (element_type.isF16() || element_type.isF32() || @@ -288,8 +294,8 @@ GraphdefToSplattedMlirTranslateFunction( << "an unsupported attribute type " << element_type; continue; } - auto new_attr = - mlir::DenseElementsAttr::get(attr.getType(), rand_val); + auto new_attr = mlir::DenseElementsAttr::get( + 
llvm::cast(attr.getType()), rand_val); inst.setAttr(attr_id, new_attr); } } @@ -301,12 +307,12 @@ GraphdefToSplattedMlirTranslateFunction( StatusOr> GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view input_arrays, absl::string_view input_dtypes, - absl::string_view input_shapes, absl::string_view output_arrays, - absl::string_view control_output_arrays, bool prune_unused_nodes, - bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, - bool enable_shape_inference, bool unconditionally_use_set_output_shapes, - mlir::MLIRContext* context) { + absl::string_view xla_compile_device_type, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, absl::string_view control_output_arrays, + bool prune_unused_nodes, bool convert_legacy_fed_inputs, + bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, + bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context) { std::vector input_array_vector; std::vector input_dtype_vector; std::vector>> input_shapes_vector; @@ -319,11 +325,11 @@ GraphdefToSplattedMlirTranslateFunction( TF_RETURN_IF_ERROR( ParseNodeNames(control_output_arrays, control_output_array_vector)); return GraphdefToSplattedMlirTranslateFunction( - input, debug_info_file, input_array_vector, input_dtype_vector, - input_shapes_vector, output_array_vector, control_output_array_vector, - prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, - upgrade_legacy, enable_shape_inference, - unconditionally_use_set_output_shapes, context); + input, debug_info_file, xla_compile_device_type, input_array_vector, + input_dtype_vector, input_shapes_vector, output_array_vector, + control_output_array_vector, prune_unused_nodes, + convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, + enable_shape_inference, unconditionally_use_set_output_shapes, context); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h index 33435cea739..677c09dd027 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -41,6 +41,7 @@ using tsl::StatusOr; // Creates MLIR entities into the given MLIR `context`. StatusOr> GraphdefToMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, + absl::string_view xla_compile_device_type, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, @@ -59,10 +60,11 @@ ABSL_DEPRECATED( // Creates MLIR entities into the given MLIR `context`. StatusOr> GraphdefToMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view input_arrays, absl::string_view input_dtypes, - absl::string_view input_shapes, absl::string_view output_arrays, - absl::string_view control_output_arrays, bool prune_unused_nodes, - bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, + absl::string_view xla_compile_device_type, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, absl::string_view control_output_arrays, + bool prune_unused_nodes, bool convert_legacy_fed_inputs, + bool graph_as_function, bool upgrade_legacy, // TODO(jpienaar): Remove these. 
bool enable_shape_inference, bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context); @@ -72,6 +74,7 @@ StatusOr> GraphdefToMlirTranslateFunction( StatusOr> GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, + absl::string_view xla_compile_device_type, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>& input_shapes, @@ -91,10 +94,11 @@ ABSL_DEPRECATED( StatusOr> GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view input_arrays, absl::string_view input_dtypes, - absl::string_view input_shapes, absl::string_view output_arrays, - absl::string_view control_output_arrays, bool prune_unused_nodes, - bool convert_legacy_fed_inputs, bool graph_as_function, bool upgrade_legacy, + absl::string_view xla_compile_device_type, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, absl::string_view control_output_arrays, + bool prune_unused_nodes, bool convert_legacy_fed_inputs, + bool graph_as_function, bool upgrade_legacy, // TODO(jpienaar): Remove these. bool enable_shape_inference, bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc index fdcfb18cd58..d739b3997c5 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc @@ -86,6 +86,12 @@ opt debug_info_file( llvm::cl::desc("Path to the debug info file of the input graph def"), llvm::cl::init("")); +// NOLINTNEXTLINE +opt xla_compile_device_type( + "tf-xla-compile-device-type", + llvm::cl::desc("Sets the compilation device type of the input graph def"), + llvm::cl::init("")); + // TODO(b/134792656): If pruning is moved into TF dialect as a pass // we should remove this. 
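The new `tf-xla-compile-device-type` option above flows into `GraphImportConfig::xla_compile_device_type`; when the value is non-empty, the GraphDef importer also attaches an `_xla_compile_device_type` attribute to the imported entry function (see the import_model.cc hunk earlier). A minimal sketch of populating the config programmatically; the field values are illustrative only:

#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h"

// Illustrative only: mirrors what GraphdefToMlirImport now does with the new
// xla_compile_device_type parameter before invoking GraphDefImporter::Convert.
tensorflow::GraphImportConfig MakeImportConfig() {
  tensorflow::GraphImportConfig specs;
  specs.enable_shape_inference = false;
  specs.unconditionally_use_set_output_shapes = true;
  // A non-empty value marks the function graph for XLA compilation with this
  // device type and surfaces as the `_xla_compile_device_type` attribute.
  specs.xla_compile_device_type = "TPU";
  return specs;
}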
// NOLINTNEXTLINE diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h index aaf0b5c4c74..af50bdc185f 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h @@ -36,6 +36,7 @@ extern llvm::cl::opt inference_type; extern llvm::cl::opt min_values; extern llvm::cl::opt max_values; extern llvm::cl::opt debug_info_file; +extern llvm::cl::opt xla_compile_device_type; extern llvm::cl::opt prune_unused_nodes; extern llvm::cl::opt convert_legacy_fed_inputs; extern llvm::cl::opt graph_as_function; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc index f1c39aba7ad..6ce04664a7b 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc @@ -45,10 +45,11 @@ inline absl::string_view StringRefToView(llvm::StringRef ref) { static OwningOpRef GraphdefToMlirTranslateFunction( llvm::StringRef input, MLIRContext* context) { auto module_or = tensorflow::GraphdefToMlirTranslateFunction( - input, debug_info_file, input_arrays, input_dtypes, input_shapes, - output_arrays, control_output_arrays, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, - enable_shape_inference, unconditionally_use_set_output_shapes, context); + input, debug_info_file, xla_compile_device_type, input_arrays, + input_dtypes, input_shapes, output_arrays, control_output_arrays, + prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, + upgrade_legacy, enable_shape_inference, + unconditionally_use_set_output_shapes, context); if (!module_or.status().ok()) return nullptr; return std::move(module_or).value(); } @@ -59,10 +60,11 @@ static TranslateToMLIRRegistration GraphdefToMlirTranslate( static OwningOpRef GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, MLIRContext* context) { auto module_or = tensorflow::GraphdefToSplattedMlirTranslateFunction( - input, debug_info_file, input_arrays, input_dtypes, input_shapes, - output_arrays, control_output_arrays, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, - enable_shape_inference, unconditionally_use_set_output_shapes, context); + input, debug_info_file, xla_compile_device_type, input_arrays, + input_dtypes, input_shapes, output_arrays, control_output_arrays, + prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, + upgrade_legacy, enable_shape_inference, + unconditionally_use_set_output_shapes, context); if (!module_or.status().ok()) return nullptr; return std::move(module_or).value(); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h index 0c6a4733dc9..95066de457a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h @@ -43,12 +43,20 @@ inline constexpr llvm::StringRef kReplicationInfoAttr = "_replication_info"; inline constexpr llvm::StringRef kTpuReplicateAttr = "_tpu_replicate"; // Device types. 
inline constexpr llvm::StringRef kTpuDevice = "TPU"; +// _xla_outside_compilation +inline constexpr llvm::StringRef kXlaOutsideCompilationAttr = + "_xla_outside_compilation"; +// device attr +inline constexpr llvm::StringRef kDeviceAttr = "device"; // Function attribute to signal that a function should be skipped from TPU // island outlining. The attribute is set in // `TpuV1BridgeExecutorIslandCoarsening` and removed in the subsequent // `TPUBridgeExecutorIslandOutlining` pass. inline constexpr llvm::StringRef kSkipIslandOutlining = "_skip_island_outlining"; +// Function attribute to signal which argument contains bounded dynamic +// dimension. +inline constexpr llvm::StringRef kDynamicArgIndexAttr = "_dynamic_arg_index"; // This string attribute encodes parallel execution groups and their associated // branches. It has the following format: diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index 42716d6e9ec..fce8c6f8dcf 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -219,7 +219,7 @@ StatusOr ConvertTensorProto(const TensorProto& input_tensor, llvm::SmallVector original_dimensions; for (auto dim : input_tensor_shape) original_dimensions.push_back(dim.size); return ElementsAttr(mlir::SplatElementsAttr::get( - single_attr.getType().clone(original_dimensions), + single_attr.getShapedType().clone(original_dimensions), single_attr.getValues()[0])); } @@ -404,7 +404,7 @@ void ConvertFloat8ElementsAttr(const mlir::DenseElementsAttr attr, } Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { - auto type = attr.getType(); + auto type = attr.getShapedType(); auto shape = type.getShape(); DataType output_dtype; TF_RETURN_IF_ERROR(ConvertToDataType(type, &output_dtype)); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index 115a1cbbfd2..373e88f7413 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -119,7 +119,7 @@ class ConvertTensorTest : public ::testing::Test { TF_ASSERT_OK(value_or.status()); auto attr = value_or.value(); - EXPECT_EQ(attr.getType().getElementType(), expected_ty); + EXPECT_EQ(attr.getShapedType().getElementType(), expected_ty); Tensor out; TF_ASSERT_OK(ConvertToTensor(attr, &out)); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.cc b/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.cc new file mode 100644 index 00000000000..f49950cdf1e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.cc @@ -0,0 +1,60 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h" + +#include +#include +#include + +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" + +namespace tensorflow { +DataDumperLoggerConfig::DataDumperLoggerConfig( + std::function get_filename, + const std::string &pass_prefix, bool print_module_scope, + bool print_after_only_on_change) + : ::tensorflow::BridgeLoggerConfig(print_module_scope, + print_after_only_on_change), + get_filename_(get_filename), + pass_prefix_(pass_prefix) {} + +void DataDumperLoggerConfig::printBeforeIfEnabled( + mlir::Pass *pass, mlir::Operation *op, PrintCallbackFn print_callback) { + std::string pass_name = pass->getName().str(); + std::string filename = get_filename_(pass_prefix_ + "before_" + pass_name); + + DumpMlir(filename, print_callback); +} + +void DataDumperLoggerConfig::printAfterIfEnabled( + mlir::Pass *pass, mlir::Operation *op, PrintCallbackFn print_callback) { + std::string pass_name = pass->getName().str(); + std::string filename = get_filename_(pass_prefix_ + "after_" + pass_name); + + DumpMlir(filename, print_callback); +} + +void DataDumperLoggerConfig::DumpMlir( + const std::string &filename, + BridgeLoggerConfig::PrintCallbackFn print_callback) { + std::unique_ptr os; + std::string filepath; + if (tensorflow::CreateFileForDumping(filename, &os, &filepath).ok()) { + print_callback(*os); + LOG(INFO) << "Dumped MLIR module to " << filepath; + } +} +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h b/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h new file mode 100644 index 00000000000..c962d68c02f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DATA_DUMPER_LOGGER_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DATA_DUMPER_LOGGER_CONFIG_H_ + +#include +#include + +#include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" + +namespace tensorflow { + +class DataDumperLoggerConfig : public ::tensorflow::BridgeLoggerConfig { + public: + explicit DataDumperLoggerConfig( + std::function get_filename, + const std::string &pass_prefix = "", bool print_module_scope = false, + bool print_after_only_on_change = true); + + void printBeforeIfEnabled(mlir::Pass *pass, mlir::Operation *op, + PrintCallbackFn print_callback) override; + + void printAfterIfEnabled(mlir::Pass *pass, mlir::Operation *op, + PrintCallbackFn print_callback) override; + + private: + static void DumpMlir(const std::string &filename, + BridgeLoggerConfig::PrintCallbackFn print_callback); + + // The function to dump the target MLIR string to file. 
+ // The parameter that will be sent to the dump_func_ is: + // The pass name (std::string) + std::function get_filename_; + + // The pass prefix. + std::string pass_prefix_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DATA_DUMPER_LOGGER_CONFIG_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc index dbc8f07c4a6..51db1be0820 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc @@ -64,7 +64,7 @@ mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, mlir::TF::RuntimeDevices* devices) { DeviceNameUtils::ParsedName device; - for (auto& kv : llvm::enumerate(array_attr)) { + for (const auto& kv : llvm::enumerate(array_attr)) { const int idx = kv.index(); auto string_attr = kv.value().dyn_cast(); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index c45ef133240..efcbca84872 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -148,7 +148,7 @@ Status CreateFileForDumping(llvm::StringRef name, dir = GetDumpDirFromEnvVar(); if (dir.empty()) { - return Status(error::Code::INVALID_ARGUMENT, + return Status(absl::StatusCode::kInvalidArgument, "(TF_DUMP_GRAPH_PREFIX not specified)"); } @@ -164,7 +164,7 @@ Status CreateFileForDumping(llvm::StringRef name, if (!status.ok()) { LOG(WARNING) << "Failed to create '" << dir << "' directory for dumping: " << status; - return Status(error::Code::UNAVAILABLE, "(unavailable)"); + return Status(absl::StatusCode::kUnavailable, "(unavailable)"); } *filepath = io::JoinPath(dir, MakeUniqueFilename(std::string(name))); @@ -173,7 +173,7 @@ Status CreateFileForDumping(llvm::StringRef name, status = env->NewWritableFile(*filepath, &file); if (!status.ok()) { LOG(WARNING) << "Failed to create file '" << filepath << "': " << status; - return Status(error::Code::UNAVAILABLE, "(unavailable)"); + return Status(absl::StatusCode::kUnavailable, "(unavailable)"); } file = std::make_unique(std::move(file)); *os = std::make_unique(std::move(file)); @@ -202,7 +202,7 @@ std::string DumpMlirOpToFile(llvm::StringRef name, mlir::Operation* op, std::unique_ptr os; std::string filepath; Status result = CreateFileForDumping(name, &os, &filepath, dirname); - if (!result.ok()) return result.error_message(); + if (!result.ok()) return std::string(result.message()); if (pass_manager) PrintPassPipeline(*pass_manager, op, *os); op->print(*os, mlir::OpPrintingFlags().useLocalScope()); @@ -236,7 +236,7 @@ std::string DumpRawStringToFile(llvm::StringRef name, llvm::StringRef content, std::unique_ptr os; std::string filepath; Status result = CreateFileForDumping(name, &os, &filepath, dirname); - if (!result.ok()) return result.error_message(); + if (!result.ok()) return std::string(result.message()); (*os) << content; LOG(INFO) << "Outputted requested string to '" << filepath << "'"; @@ -276,8 +276,8 @@ void SetCrashReproducer(mlir::PassManager& pm, llvm::StringRef dir_path) { auto* env = tensorflow::Env::Default(); auto status = env->RecursivelyCreateDir(path); if (!status.ok()) { - LOG(WARNING) << "cannot create directory '" + path + - "': " + status.error_message(); + LOG(WARNING) << "cannot create directory '" << path + << "': " << status.message(); return; } @@ -307,7 +307,7 @@ void 
SetCrashReproducer(mlir::PassManager& pm, llvm::StringRef dir_path) { if (!status.ok()) { error = absl::StrCat("Failed to create file '", path, - "': ", status.error_message()); + "': ", status.message()); return nullptr; } return std::make_unique( @@ -318,7 +318,11 @@ void SetCrashReproducer(mlir::PassManager& pm, llvm::StringRef dir_path) { void applyTensorflowAndCLOptions(mlir::PassManager& pm, llvm::StringRef dir_path) { - mlir::applyPassManagerCLOptions(pm); + mlir::registerPassManagerCLOptions(); + if (!mlir::succeeded(mlir::applyPassManagerCLOptions(pm))) { + LOG(ERROR) << "cannot apply MLIR pass manager CL options"; + return; + } SetCrashReproducer(pm, dir_path); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h index 5287a7d2d25..6069b8ca2ad 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h @@ -96,6 +96,9 @@ void SetCrashReproducer(mlir::PassManager& pm, llvm::StringRef dir_path = ""); void applyTensorflowAndCLOptions(mlir::PassManager& pm, llvm::StringRef dir_path = ""); +// Prints the pass pipeline of `pass_manager` to `os`. +void PrintPassPipeline(const mlir::PassManager& pass_manager, + mlir::Operation* op, llvm::raw_ostream& os); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_MLIR_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc index a68a66a7136..908bf40f834 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/InitAllDialects.h" // from @llvm-project #include "mlir/InitAllPasses.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project #include "mlir/Tools/mlir-opt/MlirOptMain.h" // from @llvm-project @@ -88,6 +89,7 @@ TEST(DumpMlirModuleTest, Valid) { } TEST(DumpCrashReproducerTest, RoundtripDumpAndReadValid) { + mlir::registerPassManagerCLOptions(); mlir::MLIRContext context; mlir::OwningOpRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); @@ -119,11 +121,13 @@ TEST(DumpCrashReproducerTest, RoundtripDumpAndReadValid) { mlir::registerTensorFlowPasses(); EXPECT_TRUE(mlir::MlirOptMain(output_stream->os(), std::move(input_file), - passPipeline, registry, - /*splitInputFile=*/false, - /*verifyDiagnostics=*/false, - /*verifyPasses=*/false, - /*allowUnregisteredDialects=*/false) + registry, + mlir::MlirOptMainConfig{} + .splitInputFile(false) + .verifyDiagnostics(false) + .verifyPasses(false) + .allowUnregisteredDialects(false) + .setPassPipelineParser(passPipeline)) .succeeded()); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc index 4ef6340f39e..3cf746cd226 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc @@ -62,10 +62,9 @@ TEST(ErrorUtilTest, StatusScopedDiagnosticHandler) { StatusScopedDiagnosticHandler ssdh(&context); Status s = ssdh.Combine(function()); ASSERT_TRUE(tensorflow::errors::IsInternal(s)); - EXPECT_THAT(s.error_message(), HasSubstr("Passed in error")); - EXPECT_THAT(s.error_message(), HasSubstr("Diagnostic message reported")); - EXPECT_THAT(s.error_message(), - HasSubstr("Second diagnostic message reported")); + EXPECT_THAT(s.message(), HasSubstr("Passed in error")); + EXPECT_THAT(s.message(), HasSubstr("Diagnostic message reported")); + EXPECT_THAT(s.message(), HasSubstr("Second diagnostic message reported")); } } @@ -111,11 +110,11 @@ TEST(ErrorUtilTest, StatusScopedDiagnosticHandlerWithFilter) { emitError(callsite_loc3) << "Error 3"; Status s_filtered = ssdh_filter.ConsumeStatus(); // Check for the files that should not be filtered. - EXPECT_THAT(s_filtered.error_message(), HasSubstr("keras")); - EXPECT_THAT(s_filtered.error_message(), HasSubstr("test.py")); - EXPECT_THAT(s_filtered.error_message(), HasSubstr("show_file")); + EXPECT_THAT(s_filtered.message(), HasSubstr("keras")); + EXPECT_THAT(s_filtered.message(), HasSubstr("test.py")); + EXPECT_THAT(s_filtered.message(), HasSubstr("show_file")); // Verify the filtered files are not present. - EXPECT_THAT(s_filtered.error_message(), Not(HasSubstr("filtered_file"))); + EXPECT_THAT(s_filtered.message(), Not(HasSubstr("filtered_file"))); } TEST(ErrorUtilTest, StatusScopedDiagnosticHandlerWithoutFilter) { @@ -151,10 +150,10 @@ TEST(ErrorUtilTest, StatusScopedDiagnosticHandlerWithoutFilter) { emitError(callsite_loc2) << "Error 2"; Status s_no_filter = ssdh_no_filter.ConsumeStatus(); // All files should be present, especially the 'filtered' ones. 
- EXPECT_THAT(s_no_filter.error_message(), HasSubstr("keras")); - EXPECT_THAT(s_no_filter.error_message(), HasSubstr("my_op")); - EXPECT_THAT(s_no_filter.error_message(), HasSubstr("filtered_file_A")); - EXPECT_THAT(s_no_filter.error_message(), HasSubstr("filtered_file_B")); + EXPECT_THAT(s_no_filter.message(), HasSubstr("keras")); + EXPECT_THAT(s_no_filter.message(), HasSubstr("my_op")); + EXPECT_THAT(s_no_filter.message(), HasSubstr("filtered_file_A")); + EXPECT_THAT(s_no_filter.message(), HasSubstr("filtered_file_B")); } } // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc index c03ba4c2f8a..925c2dfc57b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc @@ -50,7 +50,7 @@ static bool IsOk(const TF_Status* s) { static bool IsOk(const Status& s) { if (s.ok()) return true; - VLOG(2) << s.error_message(); + VLOG(2) << s.message(); return false; } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc index 5b89105156d..fdb1ebc39a9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc @@ -71,7 +71,7 @@ absl::StatusOr> GetResourcesFromSession( auto status = session->Run({}, variable_names, {}, &resource_tensors); if (!status.ok()) - return absl::Status(absl::StatusCode::kInternal, status.error_message()); + return absl::Status(absl::StatusCode::kInternal, status.message()); return resource_tensors; } } // namespace tf_saved_model diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 16c0e316204..449e0532cf0 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -30,6 +30,8 @@ limitations under the License. #include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/compiler/mlir/utils/string_container_utils.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/service/computation_placer.h" @@ -48,22 +50,25 @@ constexpr int kTPUTopologyRank = 4; constexpr char kDeviceTPUSystem[] = "TPU_SYSTEM"; constexpr char kDeviceTPU[] = "TPU"; constexpr char kTPUReplicatedCore[] = "TPU_REPLICATED_CORE"; +constexpr char kTPUReplicatedHost[] = "TPU_REPLICATED_HOST"; constexpr char kBadIntArrayElementMsg[] = "bad '{0}' attribute at index {1}, not an int"; -using Device = DeviceNameUtils::ParsedName; -using Devices = llvm::ArrayRef; +using ParsedDevice = DeviceNameUtils::ParsedName; +using ParsedDevices = llvm::ArrayRef; namespace { -// Finds matching devices in `devices` based on pattern `spec`. -void FindMatchingDevices(Devices devices, const Device& spec, - llvm::SmallVectorImpl* matched_devices) { +// Find matching devices in `devices` based on pattern `spec`. 
+llvm::SmallVector FindMatchingDevices( + ParsedDevices devices, const ParsedDevice& spec) { + llvm::SmallVector matching_devices; for (const auto& device : devices) if (DeviceNameUtils::IsCompleteSpecification(spec, device)) - matched_devices->push_back(device); + matching_devices.push_back(device); + return matching_devices; } -// Creates error message for a conflicting attribute of a device. +// Create error message for a conflicting attribute of a device. template Status MismatchedTPUSystemAttributeErr(absl::string_view attribute, T a, T b) { return errors::InvalidArgument("found ", kDeviceTPUSystem, @@ -71,20 +76,20 @@ Status MismatchedTPUSystemAttributeErr(absl::string_view attribute, T a, T b) { a, "' and '", b, "'"); } -// Finds TPU_SYSTEM:0 devices in `devices`. If multiple TPU_SYSTEM devices are +// Find TPU_SYSTEM:0 devices in `devices`. If multiple TPU_SYSTEM devices are // found, the first one lexicographically is returned. If no TPU_SYSTEM device // is found or if there are multiple TPU_SYSTEM devices with different jobs or // replicas, a failure will be returned. -Status GetTPUSystemDevices(Devices devices, - llvm::SmallVectorImpl* matched_devices) { - Device spec; +StatusOr> GetTPUSystemDevices( + ParsedDevices devices) { + ParsedDevice spec; spec.type = kDeviceTPUSystem; spec.has_type = true; spec.id = 0; spec.has_id = true; - llvm::SmallVector system_devices; - FindMatchingDevices(devices, spec, &system_devices); + llvm::SmallVector system_devices = + FindMatchingDevices(devices, spec); if (system_devices.empty()) return errors::InvalidArgument("no ", kDeviceTPUSystem, " devices found"); @@ -103,33 +108,36 @@ Status GetTPUSystemDevices(Devices devices, // Sort by task to be deterministic. std::sort(system_devices.begin(), system_devices.end(), - [](const Device& a, const Device& b) { return a.task < b.task; }); + [](const ParsedDevice& a, const ParsedDevice& b) { + return a.task < b.task; + }); - matched_devices->swap(system_devices); - - return OkStatus(); + return system_devices; } -// Finds TPU devices associated to system device based on spec (e.g. from +// Find TPU devices associated to system device based on spec (e.g. from // GetTPUSystemDevices). If the number of TPU devices per host do not match for // every host, a failure will be returned. -Status GetTPUDevices( - Devices devices, llvm::ArrayRef system_devices, - llvm::SmallVectorImpl>* tpu_devices) { - tpu_devices->reserve(system_devices.size()); +StatusOr, 8>> +GetTPUDevices(ParsedDevices devices, + llvm::ArrayRef system_devices) { + llvm::SmallVector, 8> tpu_devices; + tpu_devices.reserve(system_devices.size()); - auto lookup = [&devices](Device device_spec) { + auto lookup = [&devices](ParsedDevice device_spec) { device_spec.has_type = true; device_spec.type = kDeviceTPU; // Enumerate all the available TPUs. device_spec.has_id = false; - llvm::SmallVector host_tpu_devices; - FindMatchingDevices(devices, device_spec, &host_tpu_devices); + llvm::SmallVector host_tpu_devices = + FindMatchingDevices(devices, device_spec); // Sort devices by id. 
std::sort(host_tpu_devices.begin(), host_tpu_devices.end(), - [](const Device& i, const Device& j) { return i.id < j.id; }); + [](const ParsedDevice& i, const ParsedDevice& j) { + return i.id < j.id; + }); return host_tpu_devices; }; @@ -138,7 +146,7 @@ Status GetTPUDevices( const auto& device = system_devices[0]; auto host_tpu_devices = lookup(device); num_tpus_per_host = host_tpu_devices.size(); - tpu_devices->push_back(std::move(host_tpu_devices)); + tpu_devices.push_back(std::move(host_tpu_devices)); } for (const auto& device_spec : llvm::make_range( @@ -151,14 +159,15 @@ Status GetTPUDevices( "expected the number of TPU devices per host to be ", num_tpus_per_host, ", got ", host_tpu_devices.size()); - tpu_devices->push_back(std::move(host_tpu_devices)); + tpu_devices.push_back(std::move(host_tpu_devices)); } - return OkStatus(); + return tpu_devices; } -// Finds the compilation device from system device. -std::string GetTPUCompilationDevice(Device system_device) { +// Find the compilation device from system device with `DEVICE_CPU` as its +// type. +std::string GetTPUCompilationDevice(ParsedDevice system_device) { // TODO(b/110910013) GetTPUSystemDevices parses the spec and returns the // TPU_SYSTEM device, which we replace with the CPU device. We do this // replacement because we want to place the `tf._TPUCompileMlir` explicitly on @@ -167,21 +176,22 @@ std::string GetTPUCompilationDevice(Device system_device) { return DeviceNameUtils::ParsedNameToString(system_device); } -// Finds the host CPU device for a given TPU device. -std::string GetCPUHostDeviceForTPUDevice(Device tpu_device) { +// Find the host CPU device for a given TPU device with `DEVICE_CPU` as its +// type and `id` 0. +std::string GetCPUHostDeviceForTPUDevice(ParsedDevice tpu_device) { tpu_device.type = DEVICE_CPU; tpu_device.id = 0; return DeviceNameUtils::ParsedNameToString(tpu_device); } -// Determines execution devices when topology and device assignment are not +// Determine execution devices when topology and device assignment are not // defined. This is a special case where a single core computation is replicated // to every core in the mesh. TPU devices are simply added to // `execution_devices` of one replica. `num_replicas` must be 1 or the total // number of TPU devices available, and `num_cores_per_replica` must be 1. StatusOr GetFullMeshTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, - llvm::ArrayRef> tpu_devices) { + llvm::ArrayRef> tpu_devices) { const int num_tasks = tpu_devices.size(); const int num_tpus_per_task = tpu_devices[0].size(); const int num_tpu_devices = num_tasks * num_tpus_per_task; @@ -219,14 +229,14 @@ struct TaskAndDevice { int device = -1; }; -// Checks if device coordinate is outside of topology mesh shape bounds. +// Check if device coordinate is outside of topology mesh shape bounds. bool DeviceCoordinateOutOfBound(int x, int y, int z, int core, int bound_x, int bound_y, int bound_z, int bound_core) { return x < 0 || x >= bound_x || y < 0 || y >= bound_y || z < 0 || z >= bound_z || core < 0 || core >= bound_core; } -// Creates error message for an out of bound device coordinate. +// Create error message for an out of bound device coordinate. 
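This file's refactor consistently replaces out-parameters (for example the old `GetTPUSystemDevices(devices, &system_devices)`) with `StatusOr` return values that callers unwrap via `TF_ASSIGN_OR_RETURN`, as at the call site further below. The general shape of that conversion, sketched with placeholder names:

#include <vector>

#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/statusor.h"

// Placeholder example of the out-parameter -> StatusOr conversion pattern.
// Before: Status CollectIds(int count, std::vector<int>* out);
tensorflow::StatusOr<std::vector<int>> CollectIds(int count) {
  if (count < 0)
    return tensorflow::errors::InvalidArgument("count must be non-negative");
  return std::vector<int>(count, 0);
}

tensorflow::Status UseIds() {
  // TF_ASSIGN_OR_RETURN forwards the error status and binds the value on success.
  TF_ASSIGN_OR_RETURN(std::vector<int> ids, CollectIds(4));
  (void)ids;
  return tensorflow::OkStatus();
}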
Status DeviceCoordinateErrorMsg(absl::string_view attribute, int x, int y, int z, int core, int bound_x, int bound_y, int bound_z, int bound_core) { @@ -236,7 +246,7 @@ Status DeviceCoordinateErrorMsg(absl::string_view attribute, int x, int y, bound_y, ", ", bound_z, ", ", bound_core, ")"); } -// Creates error message for a duplicate device coordinate. +// Create error message for a duplicate device coordinate. Status DuplicateCoordinateErrorMsg(absl::string_view attribute, int x, int y, int z, int core) { return errors::InvalidArgument("'", attribute, @@ -244,7 +254,7 @@ Status DuplicateCoordinateErrorMsg(absl::string_view attribute, int x, int y, y, ", ", z, ", ", core, ")"); } -// Parses and validates topology (serialized string of TopologyProto), and maps +// Parse and validate topology (serialized string of TopologyProto), and maps // device coordinate (x, y, z, core) to task and device (of available TPUs). // Topology attribute device coordinates are ordered by task then device (major // to minor). @@ -326,7 +336,7 @@ StatusOr> ParseTopologyAttr( return topology; } -// Determines execution devices when topology and device assignment are defined. +// Determine execution devices when topology and device assignment are defined. // With a topology device coordinate to task and device mapping, device // assignment device coordinates can then be mapped to task and device for TPU // devices. The device assignment array is also validated. @@ -340,7 +350,7 @@ StatusOr> ParseTopologyAttr( StatusOr> GetGeneralTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, - llvm::ArrayRef> tpu_devices, + llvm::ArrayRef> tpu_devices, llvm::StringRef topology_attr, llvm::ArrayRef device_assignment_attr) { const int num_tasks = tpu_devices.size(); @@ -441,59 +451,149 @@ mlir::LogicalResult GetHostDeviceOCInGenericPipeline( return mlir::success(); } -mlir::LogicalResult GetHostDeviceOCInTPUPipeline( - mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, - std::string* host_device) { - auto replicate = cluster->getParentOfType(); - if (replicate) { - *host_device = tensorflow::kTPUReplicatedHost; - return mlir::success(); - } - - auto topology_attr = +mlir::LogicalResult GetTopology(mlir::tf_device::ClusterOp cluster, + std::string& topology) { + mlir::StringAttr topology_attr = cluster->getAttrOfType(tensorflow::kTopologyAttr); - if (!topology_attr) - return cluster.emitOpError("cluster op missing `topology` attribute"); - - auto num_cores_per_replica_attr = cluster->getAttrOfType( - tensorflow::kNumCoresPerReplicaAttr); - if (!num_cores_per_replica_attr) + if (topology_attr) { + topology = topology_attr.getValue(); + return mlir::success(); + } else { return cluster.emitOpError( - llvm::formatv("requires attribute '{0}'", - tensorflow::kNumCoresPerReplicaAttr) + llvm::formatv("requires attribute '{0}'", tensorflow::kTopologyAttr) .str()); + } +} - auto device_assignment_attr = cluster->getAttrOfType( - tensorflow::kDeviceAssignmentAttr); +mlir::LogicalResult GetDeviceAssignmentCoordinates( + mlir::tf_device::ClusterOp cluster, + llvm::SmallVector& device_coordinates) { + mlir::ArrayAttr device_assignment_attr = + cluster->getAttrOfType( + tensorflow::kDeviceAssignmentAttr); if (!device_assignment_attr) return cluster.emitOpError(llvm::formatv("requires attribute '{0}'", tensorflow::kDeviceAssignmentAttr) .str()); + if (StatusOr> fetched_device_coordinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + fetched_device_coordinates.ok()) { + 
device_coordinates = *fetched_device_coordinates; + return mlir::success(); + } else { + return cluster.emitError() << "error in fetching tpu device coordinates: " + << fetched_device_coordinates.status().message(); + } +} - auto status_or_device_coodinates = +int GetNumCoresPerReplica(mlir::tf_device::ClusterOp cluster) { + mlir::IntegerAttr num_cores_per_replica_attr = + cluster->getAttrOfType(kNumCoresPerReplicaAttr); + if (num_cores_per_replica_attr) { + return num_cores_per_replica_attr.getInt(); + } else { + return 1; + } +} - tensorflow::GetDeviceCoordinates(device_assignment_attr); +// Get the TPUDevicesAndHosts for a cluster that is not replicated. +mlir::LogicalResult GetTPUDevicesAndHostsNotReplicated( + mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, + tensorflow::TPUDevicesAndHosts& devices_and_hosts) { + std::string topology; + if (failed(GetTopology(cluster, topology))) { + return mlir::failure(); + } + + llvm::SmallVector device_coordinates; + if (failed(GetDeviceAssignmentCoordinates(cluster, device_coordinates))) { + return mlir::failure(); + } // Determine compilation and execution devices. - auto status_or_tpu_device_assignment = - tensorflow::GetTPUCompilationAndExecutionDevices( - devices.device_names(), /*num_replicas=*/1, - num_cores_per_replica_attr.getInt(), topology_attr.getValue(), - std::move(status_or_device_coodinates).value()); - if (!status_or_tpu_device_assignment.ok()) + if (StatusOr tpu_device_assignment = + tensorflow::GetTPUCompilationAndExecutionDevices( + devices.device_names(), /*num_replicas=*/1, + GetNumCoresPerReplica(cluster), topology, device_coordinates); + tpu_device_assignment.ok()) { + devices_and_hosts = tpu_device_assignment->tpu_devices; + return mlir::success(); + } else { return cluster.emitError() << "error in fetching TPU compilation/execution devices: " - << status_or_tpu_device_assignment.status().error_message(); - auto& tpu_device_assignment = status_or_tpu_device_assignment.value(); + << tpu_device_assignment.status().message(); + } +} - *host_device = tpu_device_assignment.tpu_devices[0][0].host; +mlir::LogicalResult GetHostDeviceOCInTPUPipeline( + mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, + std::string& host_device) { + mlir::tf_device::ReplicateOp replicate = + cluster->getParentOfType(); + if (replicate) { + host_device = GetDeviceAliasForHostOfLogicalCore(0); + return mlir::success(); + } + + tensorflow::TPUDevicesAndHosts devices_and_hosts; + if (failed(GetTPUDevicesAndHostsNotReplicated(devices, cluster, + devices_and_hosts))) { + return mlir::failure(); + } else { + host_device = devices_and_hosts[0][0].host; + return mlir::success(); + } +} + +// Get the map from `core` to `TPU_REPLICATED_HOST_{core}` for a replicated +// TPU cluster. +// TPU_REPLICATED_HOST_{core} is the host that corresponds to the TPU core. +// Different TPU_REPLICATED_HOST_*s can map to the same physical host within the +// same replica. Also, TPU_REPLICATED_HOST_{core} in different replicas can map +// to the same physical host. For example, if there are 2 hosts, num_replicas=8, +// and num_cores_per_replica=2, then all cores in the first 4 replicas will map +// to the first host and all cores in the second 4 replicas will map to the +// second host.
+llvm::SmallVector GetTPUToHostMapReplicated( + mlir::tf_device::ClusterOp cluster) { + int num_cores_per_replica = GetNumCoresPerReplica(cluster); + llvm::SmallVector core_to_host; + core_to_host.reserve(num_cores_per_replica); + for (int core = 0; core < num_cores_per_replica; ++core) { + core_to_host.push_back(GetDeviceAliasForHostOfLogicalCore(core)); + } + return core_to_host; +} + +// Get the map from `core` to host device for a non-replicated TPU cluster. +mlir::LogicalResult GetTPUToHostMapNotReplicated( + mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, + llvm::SmallVector& core_to_host) { + tensorflow::TPUDevicesAndHosts devices_and_hosts; + if (failed(GetTPUDevicesAndHostsNotReplicated(devices, cluster, + devices_and_hosts))) { + return mlir::failure(); + } + + // core_to_host is the list of hosts in replica 0, which is the only replica. + core_to_host.reserve(GetNumCoresPerReplica(cluster)); + for (const auto& device_and_host : devices_and_hosts[0]) { + core_to_host.push_back(device_and_host.host); + } return mlir::success(); } +// Get the map from `core` to host device for a TPU cluster. +mlir::LogicalResult GetTPUToHostMap( + mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, + llvm::SmallVector& core_to_host) { + if (cluster->getParentOfType()) { + core_to_host = GetTPUToHostMapReplicated(cluster); + return mlir::success(); + } + return GetTPUToHostMapNotReplicated(devices, cluster, core_to_host); +} + } // anonymous namespace StatusOr> GetDeviceCoordinates( @@ -518,16 +618,14 @@ StatusOr> GetDeviceCoordinates( } StatusOr GetTPUCompilationAndExecutionDevices( - Devices devices, int num_replicas, int num_cores_per_replica, + ParsedDevices devices, int num_replicas, int num_cores_per_replica, llvm::StringRef topology_attr, llvm::ArrayRef device_assignment_attr) { // Collect TPU_SYSTEM devices. - llvm::SmallVector system_devices; - TF_RETURN_IF_ERROR(GetTPUSystemDevices(devices, &system_devices)); + TF_ASSIGN_OR_RETURN(auto system_devices, GetTPUSystemDevices(devices)); // Collect TPU devices based on TPU_SYSTEM devices collected earlier. 
- llvm::SmallVector, 8> tpu_devices; - TF_RETURN_IF_ERROR(GetTPUDevices(devices, system_devices, &tpu_devices)); + TF_ASSIGN_OR_RETURN(auto tpu_devices, GetTPUDevices(devices, system_devices)); std::string compilation_device = GetTPUCompilationDevice(system_devices[0]); @@ -553,10 +651,14 @@ StatusOr GetTPUCompilationAndExecutionDevices( std::move(devices_and_ids.second)); } -std::string GetDeviceAliasForLogicalCore(int core_index) { +std::string GetDeviceAliasForLogicalCore(const int core_index) { return llvm::formatv("{0}_{1}", kTPUReplicatedCore, core_index).str(); } +std::string GetDeviceAliasForHostOfLogicalCore(const int core_index) { + return llvm::formatv("{0}_{1}", kTPUReplicatedHost, core_index).str(); +} + bool HasModelParallelism(mlir::tf_device::ClusterOp cluster) { mlir::IntegerAttr num_cores_per_replica_attr = cluster->getAttrOfType( @@ -576,13 +678,15 @@ mlir::LogicalResult GetHostDeviceOutsideComputation( mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, std::string* host_device) { if (HasTPUDevice(devices) || - cluster->getParentOfType()) - return GetHostDeviceOCInTPUPipeline(devices, cluster, host_device); - return GetHostDeviceOCInGenericPipeline(devices, host_device); + cluster->getParentOfType()) { + return GetHostDeviceOCInTPUPipeline(devices, cluster, *host_device); + } else { + return GetHostDeviceOCInGenericPipeline(devices, host_device); + } } bool IsTPUDevice(llvm::StringRef device) { - Device parsed_device; + ParsedDevice parsed_device; if (!DeviceNameUtils::ParseFullName(mlir::StringRefToView(device), &parsed_device)) return false; @@ -590,10 +694,41 @@ bool IsTPUDevice(llvm::StringRef device) { } bool IsTPUReplicatedCore(llvm::StringRef device) { - Device parsed_device; + ParsedDevice parsed_device; if (!DeviceNameUtils::ParseFullName(mlir::StringRefToView(device), &parsed_device)) return false; return parsed_device.has_type && parsed_device.type == kTPUReplicatedCore; } + +bool TypeValidForXLA(const mlir::Type& type) { + const mlir::Type elem = getElementTypeOrSelf(type); + return !elem.isa() && + !elem.isa(); +} + +mlir::LogicalResult GetDeviceToHostMap( + mlir::tf_device::ClusterOp cluster, + llvm::SmallVector& core_to_host) { + mlir::TF::RuntimeDevices devices; + if (failed(tensorflow::GetDevicesFromOp( + cluster->getParentOfType(), &devices))) { + return mlir::failure(); + } + + if (tensorflow::HasTPUDevice(devices) || + cluster->getParentOfType()) { + return GetTPUToHostMap(devices, cluster, core_to_host); + } + + std::string host_device; + if (failed(tensorflow::GetHostDeviceOCInGenericPipeline(devices, + &host_device))) { + return mlir::failure(); + } else { + core_to_host.push_back(host_device); + return mlir::success(); + } +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index f4780d6abc0..77f853be582 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -35,7 +35,6 @@ limitations under the License. 
namespace tensorflow { using tsl::StatusOr; -inline constexpr absl::string_view kTPUReplicatedHost = "TPU_REPLICATED_HOST"; inline constexpr absl::string_view kNumCoresPerReplicaAttr = "num_cores_per_replica"; inline constexpr absl::string_view kTopologyAttr = "topology"; @@ -238,10 +237,14 @@ StatusOr GetTPUCompilationAndExecutionDevices( int num_cores_per_replica, llvm::StringRef topology_attr, llvm::ArrayRef device_assignment_attr); -// Virtual device is used for evice assignment for executing ops on a specified -// logical core. +// Virtual device name of the passed logical core. The logical core is the index +// of a core within a replica. std::string GetDeviceAliasForLogicalCore(int core_index); +// Virtual device name of the host that is associated with the passed logical +// core. The logical core is the index of a core within a replica. +std::string GetDeviceAliasForHostOfLogicalCore(int core_index); + // Returns true if cluster contains model parallelism based on // `num_cores_per_replica_attribute`. Otherwise returns false. bool HasModelParallelism(mlir::tf_device::ClusterOp cluster); @@ -251,7 +254,8 @@ bool HasTPUDevice(const mlir::TF::RuntimeDevices& devices); // Parses XLA compilation and execution devices from a tf_device.cluster and // returns the host device for the head and tail computations. For TPU device, -// if the computation is replicated, kTPUReplicatedHost is returned instead. +// if the computation is replicated, GetDeviceAliasForHostOfLogicalCore(0) is +// returned instead. mlir::LogicalResult GetHostDeviceOutsideComputation( mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, std::string* host_device); @@ -262,6 +266,20 @@ bool IsTPUDevice(llvm::StringRef device); // Checks if a device string is a TPU replicated core device. bool IsTPUReplicatedCore(llvm::StringRef device); +// Checks if `type` is allowed for XLA. String and resources are not XLA types. +// There are other TF types that are not XLA types which will be removed by +// successive passes in TF/XLA bridge phase 2. +bool TypeValidForXLA(const mlir::Type& type); + +// Returns the map from core to the host that is associated with the +// core. If `cluster` is not replicated then the core is a physical core index +// and the host is a physical host name. If `cluster` is replicated then the +// core with index `i` is a logical core (`TPU_REPLICATED_CORE_i`), and the host +// is the associated virtual device name (`TPU_REPLICATED_HOST_i`). +mlir::LogicalResult GetDeviceToHostMap( + mlir::tf_device::ClusterOp cluster, + llvm::SmallVector& core_to_host); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_REWRITE_DEVICE_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 8cb93df6922..2f33ccd88b2 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -19,12 +19,15 @@ limitations under the License. 
#include #include +#include "llvm/ADT/StringRef.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/tpu/topology.pb.h" @@ -33,6 +36,20 @@ limitations under the License. namespace tensorflow { namespace { +tsl::StatusOr> GetMlirModuleFromString( + llvm::StringRef string, mlir::MLIRContext* context) { + mlir::DialectRegistry mlir_registry; + RegisterAllTensorFlowDialects(mlir_registry); + context->appendDialectRegistry(mlir_registry); + mlir::OwningOpRef mlir_module; + auto status = + tensorflow::DeserializeMlirModule(string, context, &mlir_module); + if (!status.ok()) { + return status; + } + return mlir_module; +} + using Device = DeviceNameUtils::ParsedName; bool DeviceNamesToParsedNames(llvm::ArrayRef device_names, @@ -63,7 +80,7 @@ TEST_P(ParameterizedDeviceSetTest, BadDeviceSet) { devices, /*num_replicas=*/1, /*num_cores_per_replica=*/1, topology_attr, device_assignment_attr); ASSERT_FALSE(status_or.ok()); - EXPECT_EQ(status_or.status().error_message(), std::get<1>(GetParam())); + EXPECT_EQ(status_or.status().message(), std::get<1>(GetParam())); } INSTANTIATE_TEST_SUITE_P( @@ -110,7 +127,7 @@ TEST_P(ParameterizedMetadataTest, BadMetadata) { devices, std::get<0>(GetParam()), std::get<1>(GetParam()), std::get<2>(GetParam()), std::get<3>(GetParam())); ASSERT_FALSE(status_or.ok()); - EXPECT_EQ(status_or.status().error_message(), std::get<4>(GetParam())); + EXPECT_EQ(status_or.status().message(), std::get<4>(GetParam())); } std::string TopologyWithMeshShape(llvm::ArrayRef mesh_shape) { @@ -310,7 +327,7 @@ TEST(TPURewriteDeviceUtilTest, device_assignment_attr); ASSERT_FALSE(status_or.ok()); - EXPECT_EQ(status_or.status().error_message(), + EXPECT_EQ(status_or.status().message(), "no TPU device found for 'device_assignment' device coordinate (1, " "0, 0, 0)"); } @@ -622,7 +639,7 @@ TEST(TPURewriteDeviceUtilTest, TestInvalidAttrForDeviceAssignmentDisallowed) { auto status_or_device_coodinates = GetDeviceCoordinates(device_assignment_attr); ASSERT_TRUE(!status_or_device_coodinates.ok()); - EXPECT_EQ(status_or_device_coodinates.status().error_message(), + EXPECT_EQ(status_or_device_coodinates.status().message(), "bad 'device_assignment' attribute at index 0, not an int"); } @@ -830,7 +847,7 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceTPUReplicate) { std::string host_device; EXPECT_TRUE(mlir::succeeded( GetHostDeviceOutsideComputation(runtime_devices, cluster, &host_device))); - EXPECT_EQ(host_device, kTPUReplicatedHost); + EXPECT_EQ(host_device, GetDeviceAliasForHostOfLogicalCore(0)); } TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceNotReplicated) { @@ -917,5 +934,124 @@ TEST(TPURewriteDeviceUtilTest, TestIsTPUDevice) { EXPECT_FALSE(IsTPUDevice("INVALID_DEVICE")); } +TEST(TPURewriteDeviceUtilTest, TestDeviceToHostMapBadTopology) { + static const char* const module_str = + R"( +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", 
"/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"}} { + func.func @main() -> () { + "tf_device.cluster"() ({ + tf_device.return + }) {device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], num_cores_per_replica = 2 : i64} : () -> () + func.return + } +})"; + mlir::MLIRContext context; + TF_ASSERT_OK_AND_ASSIGN(mlir::OwningOpRef module, + GetMlirModuleFromString(module_str, &context)); + mlir::tf_device::ClusterOp cluster; + module->walk( + [&](mlir::tf_device::ClusterOp descendant) { cluster = descendant; }); + llvm::SmallVector core_to_host; + EXPECT_TRUE(mlir::failed(GetDeviceToHostMap(cluster, core_to_host))); +} + +TEST(TPURewriteDeviceUtilTest, TestDeviceToHostMapBadDeviceAssignment) { + static const char* const module_str = + R"( +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"}} { + func.func @main() -> () { + "tf_device.cluster"() ({ + tf_device.return + }) {num_cores_per_replica = 2 : i64, topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01*\02\08\01"} : () -> () + func.return + } +})"; + mlir::MLIRContext context; + TF_ASSERT_OK_AND_ASSIGN(mlir::OwningOpRef module, + GetMlirModuleFromString(module_str, &context)); + mlir::tf_device::ClusterOp cluster; + module->walk( + [&](mlir::tf_device::ClusterOp descendant) { cluster = descendant; }); + llvm::SmallVector core_to_host; + EXPECT_TRUE(mlir::failed(GetDeviceToHostMap(cluster, core_to_host))); +} + +// Tests `GetDeviceToHostMap` on a non-replicated TPU cluster. +TEST(TPURewriteDeviceUtilTest, TestDeviceToHostMapNotReplicated) { + static const char* const module_str = + R"( +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"}} { + func.func @main() -> () { + "tf_device.cluster"() ({ + tf_device.return + }) {device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], num_cores_per_replica = 2 : i64, topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01*\02\08\01"} : () -> () + func.return + } +})"; + mlir::MLIRContext context; + TF_ASSERT_OK_AND_ASSIGN(mlir::OwningOpRef module, + GetMlirModuleFromString(module_str, &context)); + mlir::tf_device::ClusterOp cluster; + module->walk( + [&](mlir::tf_device::ClusterOp descendant) { cluster = descendant; }); + llvm::SmallVector core_to_host; + EXPECT_TRUE(mlir::succeeded(GetDeviceToHostMap(cluster, core_to_host))); + EXPECT_EQ(core_to_host.size(), 2); + EXPECT_EQ(core_to_host[0], "/job:localhost/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(core_to_host[1], "/job:localhost/replica:0/task:0/device:CPU:0"); +} + +// Tests `GetDeviceToHostMap` on a replicated TPU cluster. 
+TEST(TPURewriteDeviceUtilTest, TestDeviceToHostMapReplicated) { + static const char* const module_str = + R"( +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU:2", "/job:localhost/replica:0/task:0/device:TPU:3", "/job:localhost/replica:0/task:0/device:TPU:4", "/job:localhost/replica:0/task:0/device:TPU:5", "/job:localhost/replica:0/task:0/device:TPU:6", "/job:localhost/replica:0/task:0/device:TPU:7", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"}} { + func.func @main() -> () { + tf_device.replicate() {n = 4 : i32} { + "tf_device.cluster"() ({ + tf_device.return + }) {device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1], num_cores_per_replica = 2 : i64, topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01*\02\08\01"} : () -> () + tf_device.return + } + func.return + } +})"; + mlir::MLIRContext context; + TF_ASSERT_OK_AND_ASSIGN(mlir::OwningOpRef module, + GetMlirModuleFromString(module_str, &context)); + mlir::tf_device::ClusterOp cluster; + module->walk( + [&](mlir::tf_device::ClusterOp descendant) { cluster = descendant; }); + llvm::SmallVector core_to_host; + EXPECT_TRUE(mlir::succeeded(GetDeviceToHostMap(cluster, core_to_host))); + EXPECT_EQ(core_to_host.size(), 2); + EXPECT_EQ(core_to_host[0], "TPU_REPLICATED_HOST_0"); + EXPECT_EQ(core_to_host[1], "TPU_REPLICATED_HOST_1"); +} + +// Tests `GetDeviceToHostMap` on a CPU cluster. +TEST(TPURewriteDeviceUtilTest, TestDeviceToHostMapCPU) { + static const char* const module_str = + R"( +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0"}} { + func.func @main() -> () { + "tf_device.cluster"() ({ + tf_device.return + }) {} : () -> () + func.return + } +})"; + mlir::MLIRContext context; + TF_ASSERT_OK_AND_ASSIGN(mlir::OwningOpRef module, + GetMlirModuleFromString(module_str, &context)); + mlir::tf_device::ClusterOp cluster; + module->walk( + [&](mlir::tf_device::ClusterOp descendant) { cluster = descendant; }); + llvm::SmallVector core_to_host; + EXPECT_TRUE(mlir::succeeded(GetDeviceToHostMap(cluster, core_to_host))); + EXPECT_EQ(core_to_host.size(), 1); + EXPECT_EQ(core_to_host[0], "/job:localhost/replica:0/task:0/device:CPU:0"); +} + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc index 2a6d94828a3..e55ba55caf9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" #include +#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -40,6 +41,47 @@ namespace { constexpr char kNumSplitAttr[] = "num_split"; +// Gets the proper tensor dimension from XLA OpSharding. +// "replicate_on_last_tile_dim" and "last_tile_dims" should be deducted from the +// real Tensor dimensions when tiled. +// For example: +// f32[8,512](sharding={devices=[1,1,2]0,1 last_tile_dims={REPLICATED}) +// also means a replicated tensor over all devices. +// +// See xla_data.proto for detailed explanations on the fields. 
+int GetDimsFromXLAShardingTiled(const xla::OpSharding& xla_sharding) { + return xla_sharding.tile_assignment_dimensions_size() - + (xla_sharding.replicate_on_last_tile_dim() ? 1 : 0) - + xla_sharding.last_tile_dims_size(); +} + +// A sharding with OTHER type may be REPLICATED if: +// 'replicate_on_last_tile_dim' is true OR +// 'last_tile_dims' is not empty +// AND +// other than replicated last tile dims, all other dims are not sharded. +bool IsOtherReplicatedSharding(const xla::OpSharding& xla_sharding) { + int max_dim = GetDimsFromXLAShardingTiled(xla_sharding); + for (int i = 0; i < max_dim; ++i) { + if (xla_sharding.tile_assignment_dimensions(i) != 1) { + return false; + } + } + return xla_sharding.type() == xla::OpSharding::OTHER && + (xla_sharding.replicate_on_last_tile_dim() || + !xla_sharding.last_tile_dims().empty()); +} + +bool IsSplitSharding(const xla::OpSharding& sharding) { + return sharding.type() == xla::OpSharding::OTHER && + !IsOtherReplicatedSharding(sharding); +} + +bool IsReplicatedSharding(const xla::OpSharding& sharding) { + return sharding.type() == xla::OpSharding::REPLICATED || + IsOtherReplicatedSharding(sharding); +} + // Creates a tf::SplitOp that splits 'src_input' into 'num_splits' ways // in 'split_dimension' dimension and returns the split values. mlir::LogicalResult CreateSplitOp(const int num_split, @@ -147,7 +189,7 @@ mlir::LogicalResult HandleTileShardedInputs( // Split nodes at ith depth from the original input node represent nodes // that split the input data at i-th dimension. const auto& dimension_splits = input_sharding.tile_assignment_dimensions(); - for (auto num_splits_and_index : llvm::enumerate(dimension_splits)) { + for (const auto& num_splits_and_index : llvm::enumerate(dimension_splits)) { const int num_splits = num_splits_and_index.value(); const int dimension_index = num_splits_and_index.index(); if (num_splits == 1) continue; @@ -256,7 +298,7 @@ mlir::LogicalResult ExtractInputsForLogicalDevices( << input_index << "-th input"; if (input_sharding_type == xla::OpSharding::REPLICATED) { - for (auto& index_and_inputs : llvm::enumerate(*input_list)) { + for (const auto& index_and_inputs : llvm::enumerate(*input_list)) { index_and_inputs.value().emplace_back( partitioned_input.getOperand(index_and_inputs.index())); } @@ -276,7 +318,7 @@ mlir::LogicalResult ExtractInputsForLogicalDevices( continue; } - if (input_sharding_type == xla::OpSharding::OTHER) { + if (IsSplitSharding(sharding)) { llvm::SmallVector tiled_inputs; auto result = HandleTileShardedInputs( cluster_func.getLoc(), sharding, input_value, builder, &tiled_inputs); @@ -290,7 +332,7 @@ mlir::LogicalResult ExtractInputsForLogicalDevices( const int assigned_logical_device = sharding.tile_assignment_devices(i); (*input_list)[assigned_logical_device].emplace_back(tiled_inputs[i]); } - } else if (input_sharding_type == xla::OpSharding::REPLICATED) { + } else if (IsReplicatedSharding(sharding)) { for (auto& inputs : *input_list) inputs.emplace_back(input_value); } else { assert(input_sharding_type == xla::OpSharding::MAXIMAL); @@ -317,7 +359,7 @@ mlir::LogicalResult ParseAndValidateOutputSharding( if (output_sharding_attrs.size() != cluster_func.getNumResults()) return cluster_func.emitError("incorrect number of output sharding"); - for (auto output_sharding_and_index : + for (const auto& output_sharding_and_index : llvm::enumerate(output_sharding_attrs)) { const auto& output_sharding = output_sharding_and_index.value(); const int sharding_index = output_sharding_and_index.index(); @@ 
-472,7 +514,7 @@ mlir::LogicalResult ValidateAndGetTiledExecuteOutputShape( mlir::Type* tiled_logical_computation_type) { auto new_output_shape = llvm::to_vector<4>(cluster_func_output_type.getShape()); - for (auto dimension_and_output_splits : + for (const auto& dimension_and_output_splits : llvm::enumerate(output_sharding.tile_assignment_dimensions())) { const auto dimension_index = dimension_and_output_splits.index(); const auto output_splits = dimension_and_output_splits.value(); @@ -515,17 +557,17 @@ mlir::LogicalResult GetOutputTypesForLogicalDeviceComputation( output_types->reserve(cluster_func.getNumResults()); int core_index = 0; - for (auto result_and_index : llvm::enumerate(cluster_func.getResults())) { + for (const auto& result_and_index : + llvm::enumerate(cluster_func.getResults())) { const auto output_index = result_and_index.index(); const auto& output_sharding = output_sharding_config[output_index]; - const auto output_sharding_type = output_sharding.type(); const auto cluster_func_output_type = result_and_index.value().getType().cast(); // If output shape of cluster func is statically known and output is tiled // sharded, then the corresponding output shape of cluster func must be // evenly divisible number of shardings. - if (output_sharding_type == xla::OpSharding::OTHER) { + if (IsSplitSharding(output_sharding)) { mlir::Type tiled_logical_computation_type; if (cluster_func_output_type.hasRank()) { auto result = ValidateAndGetTiledExecuteOutputShape( @@ -537,7 +579,7 @@ mlir::LogicalResult GetOutputTypesForLogicalDeviceComputation( } cluster_to_core_index->emplace_back(core_index++); output_types->emplace_back(tiled_logical_computation_type); - } else if (output_sharding_type == xla::OpSharding::REPLICATED || + } else if (IsReplicatedSharding(output_sharding) || IsAssignedToLogicalDevice(core_id, output_sharding)) { cluster_to_core_index->emplace_back(core_index++); output_types->emplace_back(cluster_func_output_type); @@ -557,7 +599,7 @@ mlir::LogicalResult RemapOutputsFromLogicalDevices( mlir::tf_device::ParallelExecuteOp old_parallel_execute, int cluster_idx, mlir::tf_device::ParallelExecuteOp new_parallel_execute, mlir::OpBuilder* builder) { - for (auto& result_and_index : + for (const auto& result_and_index : llvm::enumerate(old_parallel_execute.getResults())) { const auto output_index = result_and_index.index(); const auto old_parallel_execute_output = result_and_index.value(); @@ -605,10 +647,11 @@ mlir::LogicalResult RemapOutputsFromLogicalDevices( if (output_sharding_type == xla::OpSharding::REPLICATED) { for (const auto& index_and_output : llvm::enumerate(partitioned_output.getOutput())) { + auto idx = (cluster_idx + index_and_output.index()) % + new_parallel_execute->getNumRegions(); const auto output_from_logical_device = new_parallel_execute.GetRegionOutputs( - cluster_idx + - index_and_output.index())[tpu_cluster_output_index]; + idx)[tpu_cluster_output_index]; index_and_output.value().replaceAllUsesWith( output_from_logical_device); } @@ -627,7 +670,7 @@ mlir::LogicalResult RemapOutputsFromLogicalDevices( continue; } - if (output_sharding_type == xla::OpSharding::OTHER) { + if (IsSplitSharding(output_sharding)) { if (failed(HandleTileShardedOutputs( tpu_cluster_output_index, output_sharding_config, cluster_to_core_index, location, old_parallel_execute_output, diff --git a/tensorflow/compiler/mlir/tf2xla/BUILD b/tensorflow/compiler/mlir/tf2xla/BUILD index 963bd8cfaa2..5605cbed225 100644 --- a/tensorflow/compiler/mlir/tf2xla/BUILD +++ 
b/tensorflow/compiler/mlir/tf2xla/BUILD @@ -2,9 +2,13 @@ # TF2XLA Bridge and related components. load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") -load("//tensorflow:tensorflow.bzl", "tf_cc_test") -load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") -load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_cloud") + +package_group( + name = "tensorflow_mlir_tf2xla", + packages = [ + "//tensorflow/compiler/mlir/tf2xla/...", + ], +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -26,427 +30,7 @@ cc_library( ], ) -cc_library( - name = "compile_mlir_util_no_tf_dialect_passes", - srcs = ["api/v0/compile_mlir_util.cc"], - hdrs = ["api/v0/compile_mlir_util.h"], - deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:bridge_logger", - "//tensorflow/compiler/mlir/tensorflow:convert_tensor", - "//tensorflow/compiler/mlir/tensorflow:convert_type", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", - "//tensorflow/compiler/mlir/tensorflow:error_util", - "//tensorflow/compiler/mlir/tensorflow:export_graphdef", - "//tensorflow/compiler/mlir/tensorflow:import_model", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", - "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", - "//tensorflow/compiler/mlir/tensorflow:shape_inference_pass", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/mlir/tensorflow:translate_utils", - "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy", - "//tensorflow/compiler/mlir/tf2xla:tf_xla_passes", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_targets", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf_with_tf2xla", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:layout_util", - "//tensorflow/compiler/tf2xla:xla_argument", - "//tensorflow/compiler/tf2xla:xla_helpers", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/hlo/ir:hlo", - "//tensorflow/compiler/xla/mlir/framework/transforms:passes", - "//tensorflow/compiler/xla/mlir_hlo", - "//tensorflow/compiler/xla/mlir_hlo:hlo_dialect_registration", - "//tensorflow/compiler/xla/mlir_hlo:mhlo_passes", - "//tensorflow/compiler/xla/translate/mhlo_to_hlo:layout_util", - "//tensorflow/compiler/xla/translate/mhlo_to_hlo:mlir_hlo_to_hlo", - "//tensorflow/compiler/xla/translate/mhlo_to_hlo:type_to_shape", - "//tensorflow/core:framework", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/common_runtime:core_cpu_internal", - "//tensorflow/core/platform:error_payloads", - "//tensorflow/core/platform:errors", - "//tensorflow/core/platform:logging", - "//tensorflow/core/tpu:tpu_defs", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/types:optional", - "@com_google_absl//absl/types:variant", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformUtils", - "@llvm-project//mlir:Transforms", - "@stablehlo//:register", - ], -) - -tf_cc_test( - name = "compile_mlir_util_test", - srcs = 
["api/v0/compile_mlir_util_test.cc"], - deps = [ - ":compile_mlir_util", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", - "//tensorflow/compiler/tf2xla:xla_helpers", - "//tensorflow/core:framework", - "@com_google_absl//absl/strings", - "@com_google_googletest//:gtest_main", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Pass", - ], -) - alias( name = "compile_mlir_util", - actual = ":compile_mlir_util_no_tf_dialect_passes", -) - -gentbl_cc_library( - name = "legalize_tf_patterns_inc_gen", - compatible_with = get_compatible_with_cloud(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_legalize_tf.inc", - ), - ], - tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "transforms/legalize_tf_patterns.td", - deps = [ - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", - "//tensorflow/compiler/xla/mlir_hlo:hlo_ops_td_files", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncTdFiles", - "@llvm-project//mlir:TensorOpsTdFiles", - ], -) - -gentbl_cc_library( - name = "xla_legalize_tf_passes_inc_gen", - compatible_with = get_compatible_with_cloud(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=LegalizeTf", - ], - "transforms/xla_legalize_tf_passes.h.inc", - ), - ], - tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "transforms/xla_legalize_tf_passes.td", - deps = [ - "@llvm-project//mlir:PassBaseTdFiles", - ], -) - -gentbl_cc_library( - name = "tf_xla_passes_inc_gen", - compatible_with = get_compatible_with_cloud(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TfXla", - ], - "transforms/tf_xla_passes.h.inc", - ), - ], - tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "transforms/tf_xla_passes.td", - deps = [ - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", - "//tensorflow/compiler/xla/mlir_hlo:hlo_ops_td_files", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncTdFiles", - "@llvm-project//mlir:PassBaseTdFiles", - "@llvm-project//mlir:SparseTensorDialect", - "@llvm-project//mlir:TensorOpsTdFiles", - ], -) - -cc_library( - name = "tf_xla_passes", - srcs = [ - "transforms/xla_legalize_tf_passes.h.inc", - ], - hdrs = [ - "transforms/passes.h", - ], - deps = [ - ":tf_xla_passes_inc_gen", - ":xla_legalize_tf", - "//tensorflow/compiler/xla/mlir_hlo", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:SparseTensorDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TransformUtils", - ], -) - -cc_library( - name = "legalize_utils", - srcs = ["transforms/utils.cc"], - hdrs = ["transforms/utils.h"], - deps = [ - "//tensorflow/compiler/xla/mlir_hlo", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - ], -) - -cc_library( - name = "legalize_tf", - srcs = [ - "transforms/generated_legalize_tf.inc", - "transforms/legalize_tf.cc", - ], - hdrs = [ - "transforms/passes.h", - ], - deps = [ - ":legalize_tf_patterns_inc_gen", - ":legalize_utils", - ":tf_xla_passes_inc_gen", - ":xla_legalize_tf_passes_inc_gen", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", - "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/client:sharding_builder", - "//tensorflow/compiler/xla/client/lib:conv_grad_size_util", - "//tensorflow/compiler/xla/mlir_hlo", - 
"//tensorflow/compiler/xla/mlir_hlo:convert_op_folder", - "//tensorflow/compiler/xla/translate/hlo_to_mhlo:attribute_importer", - "//tensorflow/core:framework", - "//tensorflow/core/kernels:conv_grad_shape_utils", - "//tensorflow/tsl/platform:bfloat16", - "//tensorflow/tsl/platform:status", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:Dialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformUtils", - "@stablehlo//:chlo_ops", - ], -) - -cc_library( - name = "xla_legalize_targets", - srcs = [ - "transforms/xla_legalize_targets.cc", - ], - hdrs = [ - "transforms/xla_legalize_targets.h", - ], - deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/xla/mlir_hlo", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformUtils", - "@stablehlo//:chlo_ops", - ], -) - -tf_cc_test( - name = "xla_legalize_targets_test", - srcs = ["transforms/xla_legalize_targets_test.cc"], - deps = [ - ":xla_legalize_targets", - "//tensorflow/compiler/mlir/tensorflow", - "@com_google_googletest//:gtest_main", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformUtils", - "@stablehlo//:chlo_ops", - ], -) - -tf_cc_test( - name = "verify_tfxla_legalization_test", - srcs = ["transforms/verify_tfxla_legalization_test.cc"], - deps = [ - ":legalize_tf", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", - "//tensorflow/core/lib/monitoring:cell_reader", - "//tensorflow/core/platform:errors", - "//tensorflow/tsl/lib/core:status_test_util", - "//tensorflow/tsl/platform:statusor", - "@com_google_absl//absl/strings", - "@com_google_googletest//:gtest_main", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:Pass", - ], -) - -cc_library( - name = "xla_legalize_tf", - srcs = [ - "transforms/convert_mhlo_quant_to_int.cc", - "transforms/infeed_ops_xla_adjust_layout.cc", - "transforms/legalize_tf_collective.cc", - "transforms/legalize_tf_communication.cc", - "transforms/legalize_tf_types.cc", - "transforms/tf_xla_passes.h.inc", - "transforms/tfxla_device_specific_transforms.cc", - "transforms/verify_tfxla_legalization.cc", - "transforms/xla_legalize_tf.cc", - "transforms/xla_legalize_tf_passes.h.inc", - ], - hdrs = [ - "transforms/passes.h", - ], - deps = [ - ":legalize_tf", - ":legalize_utils", - ":xla_legalize_targets", - ":xla_legalize_tf_no_fallback", - ":xla_legalize_tf_passes_inc_gen", - ":xla_legalize_tf_with_tf2xla", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", - "//tensorflow/compiler/mlir/tensorflow:mangling_util", - "//tensorflow/compiler/mlir/tensorflow:set_tpu_infeed_layout", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/tf2xla/kernels:rng_converter_utils", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:side_effect_util", - 
"//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/client:sharding_builder", - "//tensorflow/compiler/xla/mlir_hlo", - "//tensorflow/compiler/xla/mlir_hlo:chlo_legalize_to_hlo", - "//tensorflow/compiler/xla/mlir_hlo:convert_op_folder", - "//tensorflow/compiler/xla/stream_executor/tpu:c_api_conversions", - "//tensorflow/compiler/xla/stream_executor/tpu:tpu_api", - "//tensorflow/compiler/xla/translate/hlo_to_mhlo:attribute_importer", - "//tensorflow/compiler/xla/translate/mhlo_to_hlo:type_to_shape", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/util/quantization:uniform_quant_ops_params", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:QuantOps", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:SparseTensorDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:Transforms", - "@stablehlo//:chlo_ops", - ], -) - -cc_library( - name = "xla_legalize_tf_no_fallback", - srcs = [ - "transforms/xla_legalize_tf_no_fallback.cc", - "transforms/xla_legalize_tf_passes.h.inc", - ], - hdrs = [ - "transforms/passes.h", - ], - deps = [ - ":legalize_tf", - ":tf_xla_passes_inc_gen", - ":xla_legalize_tf_passes_inc_gen", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", - "//tensorflow/compiler/xla/mlir_hlo", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:SparseTensorDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:Transforms", - "@stablehlo//:chlo_ops", - ], -) - -cc_library( - name = "xla_legalize_tf_with_tf2xla", - srcs = [ - "transforms/legalize_tf_with_tf2xla.cc", - ], - hdrs = [ - "transforms/passes.h", - ], - deps = [ - ":tf_xla_passes_inc_gen", - ":xla_legalize_tf_passes_inc_gen", - "//tensorflow/compiler/mlir:op_or_arg_name_mapper", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_tensor", - "//tensorflow/compiler/mlir/tensorflow:convert_type", - "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", - "//tensorflow/compiler/mlir/tensorflow:tpu_embedding_ops_registry", - "//tensorflow/compiler/mlir/tensorflow:translate_utils", - "//tensorflow/compiler/tf2xla:xla_compilation_device", - "//tensorflow/compiler/tf2xla:xla_context", - "//tensorflow/compiler/tf2xla:xla_expression", - "//tensorflow/compiler/tf2xla:xla_helpers", - "//tensorflow/compiler/tf2xla:xla_op_registry", - "//tensorflow/compiler/xla/client:xla_builder", - "//tensorflow/compiler/xla/mlir_hlo", - "//tensorflow/compiler/xla/stream_executor:timer", - "//tensorflow/compiler/xla/translate/hlo_to_mhlo:mlir_hlo_builder", - "//tensorflow/core:core_cpu_lib", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:session_options", - 
"@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:SparseTensorDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformUtils", - ], + actual = "//tensorflow/compiler/mlir/tf2xla/api/v0:compile_mlir_util_no_tf_dialect_passes", ) diff --git a/tensorflow/compiler/mlir/tf2xla/api/v0/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v0/BUILD new file mode 100644 index 00000000000..18744b3032f --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v0/BUILD @@ -0,0 +1,138 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = ["//visibility:public"], +) + +cc_library( + name = "compile_mlir_util_no_tf_dialect_passes", + srcs = ["compile_mlir_util.cc"], + hdrs = ["compile_mlir_util.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:bridge_logger", + "//tensorflow/compiler/mlir/tensorflow:convert_tensor", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:export_graphdef", + "//tensorflow/compiler/mlir/tensorflow:import_model", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", + "//tensorflow/compiler/mlir/tensorflow:shape_inference_pass", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:translate_utils", + "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy", + "//tensorflow/compiler/mlir/tf2xla/transforms:tf_xla_passes", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_targets", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf_with_tf2xla", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:layout_util", + "//tensorflow/compiler/tf2xla:xla_argument", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/hlo/ir:hlo", + "//tensorflow/compiler/xla/mlir/framework/transforms:passes", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/compiler/xla/mlir_hlo:hlo_dialect_registration", + "//tensorflow/compiler/xla/mlir_hlo:mhlo_passes", + "//tensorflow/compiler/xla/translate/mhlo_to_hlo:layout_util", + "//tensorflow/compiler/xla/translate/mhlo_to_hlo:mlir_hlo_to_hlo", + "//tensorflow/compiler/xla/translate/mhlo_to_hlo:type_to_shape", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime:core_cpu_internal", + "//tensorflow/core/platform:error_payloads", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:logging", + "//tensorflow/core/tpu:tpu_defs", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + 
"@llvm-project//mlir:Pass", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@stablehlo//:register", + ], +) + +tf_cc_test( + name = "compile_mlir_util_test", + srcs = ["compile_mlir_util_test.cc"], + deps = [ + ":compile_mlir_util_no_tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/core:framework", + "//tensorflow/core/lib/monitoring:cell_reader", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Pass", + ], +) + +cc_library( + name = "compile_tf_graph", + srcs = ["compile_tf_graph.cc"], + hdrs = ["compile_tf_graph.h"], + deps = [ + ":compile_mlir_util_no_tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:import_model", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", + "//tensorflow/compiler/mlir/tensorflow:set_tpu_infeed_layout", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow:translate_utils", + "//tensorflow/compiler/tf2xla:layout_util", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/xla/client:compile_only_client", + "//tensorflow/compiler/xla/pjrt:compile_options_proto_cc", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/tpu:tpu_compile", + "//tensorflow/core/tpu/kernels:tpu_compile_op_support", + "//tensorflow/core/tpu/kernels:tpu_compile_proto_cc", + "//tensorflow/core/tpu/kernels:tpu_util", + "//tensorflow/tsl/platform:status", + "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/status", + "@com_google_absl//absl/types:variant", + "@llvm-project//mlir:IR", + ], +) + +tf_cc_test( + name = "compile_tf_graph_test", + testonly = 1, + srcs = ["compile_tf_graph_test.cc"], + linkstatic = 1, + deps = [ + ":compile_tf_graph", + "//tensorflow/compiler/jit", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/lib/monitoring:cell_reader", + "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", + "//tensorflow/core/tpu/kernels:tpu_compile_op_support", + "//tensorflow/tsl/lib/core:status_test_util", + "//tensorflow/tsl/lib/monitoring:test_utils", + ], +) diff --git a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.cc b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.cc index c800a6fce7a..19c148214e1 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.cc @@ -16,11 +16,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.h" #include +#include #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" -#include "absl/synchronization/mutex.h" -#include "absl/types/optional.h" -#include "absl/types/variant.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -52,6 +50,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" @@ -78,10 +77,10 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/protobuf/core_platform_payloads.pb.h" #include "tensorflow/core/tpu/tpu_defs.h" +#include "tensorflow/core/util/debug_data_dumper.h" namespace tensorflow { namespace { - constexpr absl::string_view kGroupSizeAttrName = "tf2xla.collective_info.group_size"; constexpr absl::string_view kGroupKeyAttrName = @@ -336,7 +335,7 @@ void AddLegalizationPasses(mlir::OpPassManager& pm, bool legalize_chlo, // in VerifyTFXLALegalization that full conversion happened. // TODO(b/188389290): Cleanup allow_partial_conversion as a legalization // parameter. - pm.addNestedPass(mlir::mhlo::createLegalizeTFPass( + pm.addPass(mlir::mhlo::createLegalizeTFPass( /*allow_partial_conversion=*/true, legalize_chlo, /*tf2xla_fallback_device_type=*/device_type, enable_op_fallback)); @@ -356,35 +355,6 @@ void AddLegalizationPasses(mlir::OpPassManager& pm, bool legalize_chlo, pm.addPass(mlir::TF::CreateTFShapeInferencePass()); } -// The default LLVM MLIR Inliner always runs canonicalization, however there -// is a bug where dumping the pass pipeline and recreating it in offline -// tools doesn't run canonicalization. To ensure prod and offline tools -// inlining are equal, explicitly create the Inliner with canonicalization so -// that the canonicalizer is dumped as part of pipeline passes. -// See https://github.com/llvm/llvm-project/issues/60960. -ABSL_CONST_INIT absl::Mutex pass_registration_lock(absl::kConstInit); -std::unique_ptr CreateInlinerWithCanonicalization() { - // This is really wonky. Pass Registration isn't thread safe in LLVM, so we - // need a mutex to guard pass registration. Pass registration also needs - // to happen once per thread, so make this thread local. - // TODO(b/268509024): Delete this whole function once the upstream LLVM issue - // is resolved. - static thread_local bool pass_registered = false; - if (!pass_registered) { - absl::MutexLock lock(&pass_registration_lock); - mlir::registerCanonicalizerPass(); - pass_registered = true; - } - - auto inliner = mlir::createInlinerPass(/*opPipelines=*/{}, - /*defaultPipelineBuilder=*/{}); - if (inliner->initializeOptions("default-pipeline=canonicalize").failed()) { - return nullptr; - } - - return inliner; -} - } // namespace void CreateConvertMlirToXlaHloPipeline( @@ -401,7 +371,7 @@ void CreateConvertMlirToXlaHloPipeline( // Note that the region-based control-flow produced here still contains // function call ops which get inlined by the subsequent inliner pass. pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions()); - pm.addPass(CreateInlinerWithCanonicalization()); + pm.addPass(mlir::createInlinerPass()); pm.addNestedPass( mlir::TF::CreateDropWhileShapeInvariantPass()); // Create a replicated TensorList initialization ops for all of its uses. 
This @@ -457,8 +427,6 @@ void CreateConvertMlirToXlaHloPipeline( pm.addNestedPass(mlir::TF::CreateLowerQuantizedPass()); pm.addPass(mlir::mhlo::CreateLegalizeTfTypesPass()); - pm.addPass(mlir::mhlo::createLegalizeTFModulePass( - /*tf2xla_fallback_device_type=*/device_type)); for (auto& target_pass : custom_legalization_passes) { pm.addNestedPass(std::move(target_pass)); @@ -481,7 +449,7 @@ void CreateConvertMlirToXlaHloPipeline( } if (CanInlineFunctionsPostLegalization(device_type)) { - pm.addPass(CreateInlinerWithCanonicalization()); + pm.addPass(mlir::createInlinerPass()); } // In order to export to XLA, we must sink constants to control flow regions, @@ -543,20 +511,34 @@ Status RefineShapes(llvm::ArrayRef arg_shapes, Status LegalizeToHlo(mlir::ModuleOp module_op, llvm::StringRef device_type, bool enable_op_fallback, llvm::MutableArrayRef> - custom_legalization_passes) { + custom_legalization_passes, + llvm::StringRef module_name = llvm::StringRef()) { mlir::PassManager tf2xla(module_op.getContext()); applyTensorflowAndCLOptions(tf2xla); CreateConvertMlirToXlaHloPipeline(tf2xla, device_type, enable_op_fallback, custom_legalization_passes); - if (VLOG_IS_ON(1)) - tensorflow::DumpMlirOpToFile("legalize_hlo_before", module_op, "", &tf2xla); - if (VLOG_IS_ON(2)) { + if (DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain) || + VLOG_IS_ON(1)) { + tensorflow::DumpMlirOpToFile( + DEBUG_DATA_DUMPER()->GetDumpFilename(module_name.str(), kDebugGroupMain, + "legalize_hlo_before"), + module_op, "", &tf2xla); + } + + if (VLOG_IS_ON(2) || DEBUG_DATA_DUMPER()->ShouldDump( + module_name.str(), kDebugGroupBridgePhase2)) { // Print the whole module after each pass which requires disabling // multi-threading as well. module_op.getContext()->disableMultithreading(); - tf2xla.enableIRPrinting(std::make_unique( - /*print_module_scope=*/true)); + tf2xla.enableIRPrinting( + std::make_unique<::tensorflow::DataDumperLoggerConfig>( + [module_name](const std::string& pass_tag_name) { + return DEBUG_DATA_DUMPER()->GetDumpFilename( + module_name.str(), kDebugGroupBridgePhase2, pass_tag_name); + }, + "", + /*print_module_scope=*/true)); } // Make sure we catch any error reported by MLIR and forward it to the TF @@ -572,8 +554,14 @@ Status LegalizeToHlo(mlir::ModuleOp module_op, llvm::StringRef device_type, return error_handler.Combine(status); } - if (VLOG_IS_ON(1)) - tensorflow::DumpMlirOpToFile("legalize_hlo_after", module_op, "", &tf2xla); + if (DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain) || + VLOG_IS_ON(1)) { + tensorflow::DumpMlirOpToFile( + DEBUG_DATA_DUMPER()->GetDumpFilename(module_name.str(), kDebugGroupMain, + "legalize_hlo_after"), + module_op, "", &tf2xla); + } + Status status = error_handler.ConsumeStatus(); tensorflow::OkOrSetErrorCounterPayload( tensorflow::core::platform::ErrorSourceProto::MLIR_BRIDGE_PHASE_2, @@ -602,9 +590,10 @@ Status ConvertMLIRToXlaComputation( bool enable_op_fallback, bool return_tuple, const XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, llvm::MutableArrayRef> - custom_legalization_passes) { + custom_legalization_passes, + llvm::StringRef module_name) { TF_RETURN_IF_ERROR(LegalizeToHlo(module_op, device_type, enable_op_fallback, - custom_legalization_passes)); + custom_legalization_passes, module_name)); mlir::MlirToHloConversionOptions options; options.layout_preference_fn = @@ -722,7 +711,8 @@ Status CompileMlirToXlaHlo( XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, XlaCompilationResult* 
compilation_result, llvm::MutableArrayRef> - custom_legalization_passes) { + custom_legalization_passes, + llvm::StringRef module_name) { if (enable_op_fallback && GetMlirBridge2ndPhaseRolloutPolicy(module_op) == MlirBridgeRolloutPolicy::kDisabledAfterGraphAnalysis) { @@ -736,7 +726,7 @@ Status CompileMlirToXlaHlo( TF_RETURN_IF_ERROR(ConvertMLIRToXlaComputation( module_op, device_type, compilation_result->computation.get(), use_tuple_args, enable_op_fallback, use_return_tuple, - shape_determination_fns, custom_legalization_passes)); + shape_determination_fns, custom_legalization_passes, module_name)); TF_RETURN_IF_ERROR(PopulateCollectiveInfo(module_op, compilation_result)); @@ -751,7 +741,8 @@ Status CompileSerializedMlirToXlaHlo( const XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, XlaCompilationResult* compilation_result, llvm::MutableArrayRef> - custom_legalization_passes) { + custom_legalization_passes, + llvm::StringRef module_name) { mlir::DialectRegistry mlir_registry; RegisterDialects(mlir_registry); mlir::MLIRContext mlir_context(mlir_registry); @@ -767,7 +758,7 @@ Status CompileSerializedMlirToXlaHlo( mlir_module.get(), tensor_or_resource_shapes, device_type, use_tuple_args, enable_op_fallback, /*use_return_tuple=*/true, /*use_resource_updates_for_aliases=*/false, shape_determination_fns, - compilation_result, custom_legalization_passes); + compilation_result, custom_legalization_passes, module_name); } // Rewrites the given module with specified args. For each of the constant args, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.h b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.h index 19f1551382d..84cf70f0b3b 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.h +++ b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.h @@ -29,8 +29,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" namespace tensorflow { @@ -72,7 +72,8 @@ Status ConvertMLIRToXlaComputation( const XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns = {}, llvm::MutableArrayRef> - custom_legalization_passes = {}); + custom_legalization_passes = {}, + llvm::StringRef module_name = llvm::StringRef()); // Creates a MLIR pipeline that lowers MLIR module to MHLO dialect. The input // module should only contain operations in tf dialect. For example, if the @@ -144,7 +145,8 @@ Status CompileMlirToXlaHlo( XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, XlaCompilationResult* compilation_result, llvm::MutableArrayRef> - custom_legalization_passes); + custom_legalization_passes, + llvm::StringRef module_name = llvm::StringRef()); // Compiles a serialized MLIR module into XLA HLO, generates all accompanying // metadata and stores them in CompilationResult. 
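With module_name plumbed through as above, callers opt into the grouped IR dumps simply by passing a name. A minimal call-site sketch follows; serialized_mlir_module and "my_cluster" are placeholders, and the argument types follow the declaration added to the header above (with the TensorShape element type assumed, as in the existing tests).

XlaCompilationResult compilation_result;
std::vector<TensorShape> arg_shapes = {TensorShape({1})};
std::vector<std::unique_ptr<mlir::Pass>> custom_passes;  // none needed here

Status status = CompileSerializedMlirToXlaHlo(
    serialized_mlir_module, arg_shapes, /*device_type=*/"XLA_TPU_JIT",
    /*use_tuple_args=*/true, /*enable_op_fallback=*/false,
    /*shape_determination_fns=*/{}, &compilation_result, custom_passes,
    /*module_name=*/"my_cluster");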
@@ -154,7 +156,8 @@ Status CompileSerializedMlirToXlaHlo( const XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, XlaCompilationResult* compilation_result, llvm::MutableArrayRef> - custom_legalization_passes = {}); + custom_legalization_passes = {}, + llvm::StringRef module_name = llvm::StringRef()); // Compiles a TensorFlow Graph (already converted to MLIR, imported with // tf_executor dialect still present) into XLA HLO, generates all accompanying diff --git a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util_test.cc index 31dd9aeb551..d3158b5a917 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util_test.cc @@ -27,11 +27,13 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/monitoring/cell_reader.h" namespace tensorflow { namespace { using ::mlir::OpPassManager; +using ::tensorflow::monitoring::testing::CellReader; using ::testing::HasSubstr; static constexpr char kMlirModuleStr[] = R"( @@ -64,6 +66,8 @@ TEST(LegalizeMlirTest, FailsLegalizesModule) { func.return %0 : tensor<1xi32> } })"; + CellReader count( + "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_pass_count"); std::vector arg_shapes; XlaCompilationResult compilation_result; @@ -73,6 +77,7 @@ TEST(LegalizeMlirTest, FailsLegalizesModule) { /*shape_determination_fns=*/{}, &compilation_result); EXPECT_FALSE(status.ok()); + EXPECT_EQ(count.Delta("tf.DoesntExist", "Unknown"), 1); } TEST(CompileMlirUtil, CreatesPipeline) { @@ -89,9 +94,7 @@ TEST(CompileMlirUtil, CreatesPipeline) { TEST(CompileMlirUtil, HasLegalizationPass) { OpPassManager pass_manager; llvm::StringRef device_type = "XLA_CPU_JIT"; - absl::string_view kLegalizeTfPass = - "xla-legalize-tf{allow-partial-conversion=false device-type=XLA_CPU_JIT " - "legalize-chlo=true prefer-tf2xla=true use-tf2xla-fallback=true})"; + absl::string_view kLegalizeTfPass = "xla-legalize-tf"; CreateConvertMlirToXlaHloPipeline(pass_manager, device_type, /*enable_op_fallback=*/true, @@ -121,5 +124,24 @@ TEST(CompileMlirUtil, CanonicalizationIsExplicitDuringInlining) { EXPECT_THAT(pass_description, HasSubstr(kInlinePass)); } +TEST(LegalizeMlirTest, LegalizesModuleWithDynamicShape) { + constexpr char legalization[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main(%arg0: tensor>) -> tensor> { + %0 = "tf.Identity"(%arg0) : (tensor>) -> tensor> + func.return %0 : tensor> + } + })"; + + std::vector arg_shapes = {{1}}; + XlaCompilationResult compilation_result; + Status status = CompileSerializedMlirToXlaHlo( + legalization, arg_shapes, /*device_type=*/"XLA_TPU_JIT", + /*use_tuple_args=*/true, /*enable_op_fallback=*/false, + /*shape_determination_fns=*/{}, &compilation_result); + + EXPECT_TRUE(status.ok()); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph.cc b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph.cc new file mode 100644 index 00000000000..9df38ac36cb --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph.cc @@ -0,0 +1,257 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph.h" + +#include +#include +#include +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/set_tpu_infeed_layout.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/client/compile_only_client.h" +#include "tensorflow/core/framework/versions.pb.h" +#include "tensorflow/core/lib/monitoring/counter.h" +#include "tensorflow/core/lib/monitoring/sampler.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/core/tpu/tpu_compile.h" +#include "tensorflow/tsl/platform/status.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace tensorflow { +namespace tf2xla { +namespace v0 { + +using ::tensorflow::tpu::FunctionToHloArgs; +using ::tensorflow::tpu::GuaranteedConsts; +using ::tensorflow::tpu::MlirToHloArgs; +using ::tensorflow::tpu::ShardingAndIndex; + +auto* phase2_bridge_compilation_status = + tensorflow::monitoring::Counter<1>::New( + "/tensorflow/core/tf2xla/api/v0/" + "phase2_compilation_status", /*metric_name*/ + "Tracks the compilation status of the non-mlir bridge", + /* metric description */ "status" /* metric label */); + +auto* phase2_bridge_compilation_time = tsl::monitoring::Sampler<1>::New( + {"/tensorflow/core/tf2xla/api/v0/phase2_compilation_time", + "The wall-clock time spent on executing graphs in milliseconds.", + "configuration"}, + // Power of 1.5 with bucket count 45 (> 23 hours) + {tsl::monitoring::Buckets::Exponential(1, 1.5, 45)}); + +// There were no MLIR ops so the old bridge was called successfully. +constexpr char kOldBridgeNoMlirSuccess[] = "kOldBridgeNoMlirSuccess"; +// There were no MLIR ops so the old bridge was called but it failed. +constexpr char kOldBridgeNoMlirFailure[] = "kOldBridgeNoMlirFailure"; + +namespace { + +// Time the execution of kernels (in CPU cycles). Meant to be used as RAII. 
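As a rough check on the bucket comment above: with a scale of 1 ms and a growth factor of 1.5 over 45 buckets, the largest finite boundary is on the order of 1.5^45 ms, roughly 8.4e7 ms or about 23.3 hours (give or take one growth step depending on how the boundaries are indexed), which is where the "> 23 hours" note comes from; anything slower still lands in the final overflow bucket.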
+struct CompilationTimer { + uint64 start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle(); + + uint64 ElapsedCycles() { + return profile_utils::CpuUtils::GetCurrentClockCycle() - start_cycles; + } + + int64_t ElapsedCyclesInMilliseconds() { + std::chrono::duration duration = + profile_utils::CpuUtils::ConvertClockCycleToTime(ElapsedCycles()); + + return std::chrono::duration_cast(duration) + .count(); + } +}; + +// Populates input_output_alias field in the HLO Module proto. +Status PopulateInputOutputAliasing( + mlir::func::FuncOp main_fn, + XlaCompiler::CompilationResult* compilation_result, bool use_tuple_args) { + constexpr char kAliasingAttr[] = "tf.aliasing_output"; + llvm::SmallDenseMap output_to_input_alias; + unsigned num_arguments = main_fn.getNumArguments(); + for (unsigned arg_index = 0; arg_index < num_arguments; ++arg_index) { + if (auto aliasing_output = main_fn.getArgAttrOfType( + arg_index, kAliasingAttr)) + output_to_input_alias[aliasing_output.getInt()] = arg_index; + } + + if (output_to_input_alias.empty()) return OkStatus(); + + xla::HloModuleProto* module_proto = + compilation_result->computation->mutable_proto(); + StatusOr program_shape_or_status = + compilation_result->computation->GetProgramShape(); + TF_RET_CHECK(program_shape_or_status.ok()); + + xla::ProgramShape& program_shape = program_shape_or_status.value(); + if (!program_shape.result().IsTuple()) + return errors::Internal("Expect result to have tuple shape"); + + xla::HloInputOutputAliasConfig config(program_shape.result()); + for (auto alias : output_to_input_alias) { + if (use_tuple_args) { + TF_RETURN_IF_ERROR(config.SetUpAlias( + xla::ShapeIndex({alias.first}), 0, xla::ShapeIndex({alias.second}), + xla::HloInputOutputAliasConfig::AliasKind::kMayAlias)); + } else { + TF_RETURN_IF_ERROR(config.SetUpAlias( + xla::ShapeIndex({alias.first}), alias.second, xla::ShapeIndex({}), + xla::HloInputOutputAliasConfig::AliasKind::kMayAlias)); + } + } + *module_proto->mutable_input_output_alias() = config.ToProto(); + return OkStatus(); +} + +// Transforms the given module to be suitable for export to TensorFlow GraphDef +// and then exports all functions to the given library. +Status PrepareAndExportToLibrary(mlir::ModuleOp module, + FunctionLibraryDefinition* flib_def) { + // Pass pipeline is defined here instead of leveraging the phase one export + // pipeline because only the functional to executor dialect conversion and + // breakup islands passes are common between the export pipeline and here. + // Reconsider this if there is more commonality in the future with more + // passes. 
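The timer defined above is consumed a little further down in this file; reduced to its essentials, the pattern looks like the following. The cell label here is a placeholder; the real code builds labels such as graph_old_bridge_has_mlir.

{
  CompilationTimer timer;
  // ... run the compilation being measured ...
  phase2_bridge_compilation_time->GetCell("example_config")
      ->Add(timer.ElapsedCyclesInMilliseconds());
}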
+ mlir::PassManager manager(module.getContext()); + applyTensorflowAndCLOptions(manager); + manager.addPass(mlir::TF::CreatePrepareTpuComputationForTfExportPass()); + manager.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional()); + manager.addNestedPass( + mlir::CreateFunctionalToExecutorDialectConversionPass()); + manager.addPass(mlir::CreateBreakUpIslandsPass()); + + mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); + if (failed(manager.run(module))) return diag_handler.ConsumeStatus(); + + GraphExportConfig config; + config.export_entry_func_to_flib = true; + return tensorflow::ConvertMlirToGraph(module, config, /*graph=*/nullptr, + flib_def); +} + +} // namespace + +tsl::Status CompileTensorflowGraphToHlo( + const std::variant& computation, + const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, + const XlaShapeLayoutHelpers::ShapeDeterminationFns + shape_determination_funcs, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + xla::CompileOnlyClient* client, + XlaCompiler::CompilationResult* compilation_result) { + LOG_FIRST_N(INFO, 1) << "Compiling MLIR computation to XLA HLO using the " + "old (non-MLIR) tf2xla bridge"; + + *compilation_result = {}; + bool has_mlir = computation.index() == 0; + + std::string mlir_string = has_mlir ? "has_mlir" : "has_function_to_hlo"; + const std::string kBridgePhase2Config = + absl::StrCat("graph_old_bridge_", mlir_string); + CompilationTimer timer; + + if (!has_mlir) { + FunctionToHloArgs function_computation = std::get<1>(computation); + Status comp_status = CompileTFFunctionToHlo( + *function_computation.flib_def, function_computation.graph_def_version, + shape_determination_funcs, arg_shapes, + function_computation.guaranteed_constants, + *function_computation.function, metadata, client, arg_core_mapping, + per_core_arg_shapes, use_tuple_args, compilation_result); + if (comp_status.ok()) { + phase2_bridge_compilation_status->GetCell(kOldBridgeNoMlirSuccess) + ->IncrementBy(1); + } else { + phase2_bridge_compilation_status->GetCell(kOldBridgeNoMlirFailure) + ->IncrementBy(1); + } + + phase2_bridge_compilation_time->GetCell(kBridgePhase2Config) + ->Add(timer.ElapsedCyclesInMilliseconds()); + return comp_status; + } + + mlir::DialectRegistry registry; + mlir::RegisterAllTensorFlowDialects(registry); + mlir::MLIRContext context(registry); + + mlir::OwningOpRef mlir_module; + TF_RETURN_IF_ERROR(DeserializeMlirModule(std::get<0>(computation).mlir_module, + &context, &mlir_module)); + if (!mlir::SetTPUInfeedLayout(mlir_module)) + return errors::Internal("Failed to set layouts attribute"); + + if (VLOG_IS_ON(2)) { + tensorflow::DumpMlirOpToFile("legalize_with_old_bridge", mlir_module.get()); + } + constexpr char kEntryFuncName[] = "main"; + auto main_fn = mlir_module->lookupSymbol(kEntryFuncName); + if (!main_fn) { + return errors::Internal( + "TPU compile op requires module with a entry function main"); + } + + // Export functions to the library. 
+ auto flib_def = std::make_unique( + OpRegistry::Global(), FunctionDefLibrary()); + TF_RETURN_IF_ERROR(PrepareAndExportToLibrary(*mlir_module, flib_def.get())); + + if (VLOG_IS_ON(2)) { + tensorflow::DumpMlirOpToFile("legalize_with_old_bridge_post_transform", + mlir_module.get()); + } + VersionDef versions; + if (mlir::failed(ExtractTfVersions(*mlir_module, &versions))) { + return errors::Internal( + "module attribute in _TPUCompileMlir op is missing tf versions."); + } + + NameAttrList func; + func.set_name(kEntryFuncName); + GuaranteedConsts consts; + + *compilation_result = {}; + + TF_RETURN_IF_ERROR(CompileTFFunctionToHlo( + *flib_def, versions.producer(), shape_determination_funcs, arg_shapes, + consts, func, metadata, client, arg_core_mapping, per_core_arg_shapes, + use_tuple_args, compilation_result)); + + phase2_bridge_compilation_time->GetCell(kBridgePhase2Config) + ->Add(timer.ElapsedCyclesInMilliseconds()); + + return PopulateInputOutputAliasing(main_fn, compilation_result, + use_tuple_args); +} + +}; // namespace v0 +}; // namespace tf2xla +}; // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph.h b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph.h new file mode 100644 index 00000000000..b249b228330 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph.h @@ -0,0 +1,51 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V0_COMPILE_TF_GRAPH_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V0_COMPILE_TF_GRAPH_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/types/variant.h" +#include "tensorflow/compiler/xla/client/compile_only_client.h" +#include "tensorflow/compiler/xla/pjrt/compile_options.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/tpu/kernels/tpu_compile.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" + +namespace tensorflow { +namespace tf2xla { +namespace v0 { + +// Compiles the given Tensorflow graph into xla::HLO. The result is in +// compilation_result. If the input computation is in MLIR, it will be +// converted to a Tensorflow graph. Otherwise, the graph compiler will be run. 
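A call-site sketch may make the contract concrete. It is adapted from the unit test added further below, so every value is a test-style placeholder (host platform, empty metadata, a serialized module that must contain a main function) rather than production configuration, and the snippet assumes the tensorflow namespace.

se::Platform* platform =
    se::MultiPlatformManager::PlatformWithName("Host").value();
xla::CompileOnlyClient* client =
    xla::ClientLibrary::GetOrCreateCompileOnlyClient(platform).value();

tpu::MlirToHloArgs computation;
computation.mlir_module = serialized_module_with_main;  // placeholder text

tpu::TPUCompileMetadataProto metadata;
std::vector<TensorShape> arg_shapes;
std::vector<tpu::ShardingAndIndex> arg_core_mapping;
std::vector<std::vector<xla::Shape>> per_core_arg_shapes;
XlaCompiler::CompilationResult result;

tsl::Status status = CompileTensorflowGraphToHlo(
    computation, metadata, /*use_tuple_args=*/true,
    /*shape_determination_funcs=*/{}, arg_shapes, &arg_core_mapping,
    &per_core_arg_shapes, client, &result);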
+tsl::Status CompileTensorflowGraphToHlo( + const std::variant& computation, + const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_funcs, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + xla::CompileOnlyClient* client, + XlaCompiler::CompilationResult* compilation_result); + +} // namespace v0 +} // namespace tf2xla +}; // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V0_COMPILE_TF_GRAPH_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph_test.cc new file mode 100644 index 00000000000..678e1ab7243 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph_test.cc @@ -0,0 +1,130 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph.h" + +#include +#include + +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/core/lib/monitoring/cell_reader.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/tsl/lib/core/status_test_util.h" +#include "tensorflow/tsl/lib/monitoring/test_utils.h" + +namespace tensorflow { +namespace tf2xla { +namespace v0 { +namespace { + +using ::tensorflow::monitoring::testing::CellReader; +using ::tensorflow::tpu::FunctionToHloArgs; +using ::tensorflow::tpu::MlirToHloArgs; +using ::tensorflow::tpu::ShardingAndIndex; +using ::tsl::monitoring::testing::Histogram; + +static constexpr char kCompilationTimeStreamzName[] = + "/tensorflow/core/tf2xla/api/v0/phase2_compilation_time"; + +static constexpr char kCompilationStatusStreamzName[] = + "/tensorflow/core/tf2xla/api/v0/phase2_compilation_status"; + +MlirToHloArgs CreateTestMlirToHloArgs() { + static constexpr char kMlirModuleStr[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main() -> () { + func.return + } + })"; + + MlirToHloArgs mlir_to_hlo_args; + mlir_to_hlo_args.rollout_state = + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_DISABLED; + mlir_to_hlo_args.mlir_module = kMlirModuleStr; + + return mlir_to_hlo_args; +} + +class CompileTFGraphTest : public ::testing::Test { + public: + tsl::Status CompileWithComputation( + const std::variant + computation) { + se::Platform* platform = + se::MultiPlatformManager::PlatformWithName("Host").value(); + auto client = + xla::ClientLibrary::GetOrCreateCompileOnlyClient(platform).value(); + + std::vector arg_shapes; + bool use_tuple_args = true; + std::vector arg_core_mapping; + std::vector> per_core_arg_shapes; + XlaCompiler::CompilationResult result; + tpu::TPUCompileMetadataProto metadata_proto; + 
XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_funcs; + + return CompileTensorflowGraphToHlo( + computation, metadata_proto, use_tuple_args, shape_determination_funcs, + arg_shapes, &arg_core_mapping, &per_core_arg_shapes, client, &result); + } +}; + +TEST_F(CompileTFGraphTest, RecordsStreamzForMlirFallback) { + CellReader compilation_time(kCompilationTimeStreamzName); + + MlirToHloArgs mlir_to_hlo_args = CreateTestMlirToHloArgs(); + + TF_EXPECT_OK(CompileWithComputation(mlir_to_hlo_args)); + + Histogram histogram = compilation_time.Delta("graph_old_bridge_has_mlir"); + + EXPECT_EQ(histogram.num(), 1); +} + +TEST_F(CompileTFGraphTest, RecordsStreamzForFunctionToHlo) { + CellReader compilation_time(kCompilationTimeStreamzName); + CellReader compilation_status(kCompilationStatusStreamzName); + + FunctionDef empty_function = + tensorflow::FunctionDefHelper::Create("empty", {}, {}, {}, {}, {}); + + tensorflow::FunctionDefLibrary fdef; + *(fdef.add_function()) = empty_function; + tensorflow::FunctionLibraryDefinition flib_def( + tensorflow::OpRegistry::Global(), fdef); + + OpInputList guaranteed_constants; + NameAttrList function; + function.set_name("empty"); + + FunctionToHloArgs function_to_hlo_args = {&function, + &flib_def, + /*graph_def_version=*/0, + {&guaranteed_constants}}; + + TF_EXPECT_OK(CompileWithComputation(function_to_hlo_args)); + + Histogram histogram = + compilation_time.Delta("graph_old_bridge_has_function_to_hlo"); + + EXPECT_EQ(histogram.num(), 1); + EXPECT_EQ(compilation_status.Delta("kOldBridgeNoMlirSuccess"), 1); +} + +} // namespace +} // namespace v0 +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD new file mode 100644 index 00000000000..a95e558f506 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD @@ -0,0 +1,82 @@ +load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + ":__subpackages__", + ":tf2xla_users", + ], +) + +# Please reach out to tf-bridge-team@ before using the TF2XLA bridge. 
+package_group(name = "tf2xla_users") + +cc_library( + name = "legalize_tf", + srcs = ["legalize_tf.cc"], + hdrs = ["legalize_tf.h"], + deps = [ + ":device_type_proto_cc", + "//tensorflow/compiler/jit:flags_headers", + "//tensorflow/compiler/jit:shape_inference", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:export_graphdef", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", + "//tensorflow/compiler/mlir/tensorflow:set_tpu_infeed_layout", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow:translate_utils", + "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", + "//tensorflow/compiler/mlir/tf2xla/api/v0:compile_tf_graph", + "//tensorflow/compiler/tf2xla:layout_util", + "//tensorflow/compiler/tf2xla:tf2xla_util", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/xla/client:compile_only_client", + "//tensorflow/compiler/xla/mlir_hlo:hlo_dialect_registration", + "//tensorflow/compiler/xla/pjrt:compile_options_proto_cc", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/platform:statusor", + "//tensorflow/core/tpu:tpu_compile", + "//tensorflow/core/tpu/kernels:tpu_compile_op_support", + "//tensorflow/core/tpu/kernels:tpu_compile_proto_cc", + "//tensorflow/core/tpu/kernels:tpu_util_hdrs", + "//tensorflow/tsl/platform:status", + "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/status", + "@com_google_absl//absl/types:variant", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@stablehlo//:register", + ], +) + +tf_cc_test( + name = "legalize_tf_test", + srcs = ["legalize_tf_test.cc"], + deps = [ + ":device_type_proto_cc", + ":legalize_tf", + "//tensorflow/compiler/jit", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/lib/monitoring:cell_reader", + "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", + "//tensorflow/core/tpu/kernels:tpu_compile_op_support", + "//tensorflow/tsl/lib/monitoring:test_utils", + "//tensorflow/tsl/platform:statusor", + "@com_google_googletest//:gtest_main", + ], +) + +tf_proto_library( + name = "device_type_proto", + srcs = ["device_type.proto"], + cc_api_version = 2, +) diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/device_type.proto b/tensorflow/compiler/mlir/tf2xla/api/v1/device_type.proto new file mode 100644 index 00000000000..6bca9312b4a --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/device_type.proto @@ -0,0 +1,11 @@ +syntax = "proto2"; + +package tensorflow.tf2xla.v1; + +// The requested device type to compile for. +enum DeviceType { + DEVICE_TYPE_UNSPECIFIED = 0; + XLA_TPU_JIT = 1; + XLA_CPU_JIT = 2; + XLA_GPU_JIT = 3; +} diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.cc new file mode 100644 index 00000000000..f5f6818d33e --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.cc @@ -0,0 +1,281 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.h" + +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/variant.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "stablehlo/dialect/Register.h" // from @stablehlo +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/shape_inference.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/set_tpu_infeed_layout.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v0/compile_tf_graph.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/register.h" +#include "tensorflow/core/lib/monitoring/counter.h" +#include "tensorflow/core/lib/monitoring/sampler.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/core/tpu/kernels/tpu_util.h" +#include "tensorflow/core/tpu/tpu_compile.h" +#include "tensorflow/tsl/platform/status.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace tensorflow { +namespace tf2xla { +namespace v1 { + +using tpu::FunctionToHloArgs; +using tpu::MlirToHloArgs; +using tpu::ShardingAndIndex; + +auto* mlir_second_phase_count = tensorflow::monitoring::Counter<1>::New( + "/tensorflow/core/tf2xla/api/v1/phase2_compilation_status" /*metric_name*/, + "Counts the number of graphs that were analyzed prior deciding whether " + "the MLIR or the old bridge will be used" /* metric description */, + "status" /* metric label */); + +auto* phase2_bridge_compilation_time = tsl::monitoring::Sampler<1>::New( + {"/tensorflow/core/tf2xla/api/v1/phase2_compilation_time", + "The wall-clock time spent on executing graphs in milliseconds.", + "configuration"}, + // Power of 1.5 with bucket count 45 (> 23 hours) + {tsl::monitoring::Buckets::Exponential(1, 1.5, 45)}); + +// The label `status` is used to count the following events: +// MLIR bridge phase 2 was executed and the graph was processed successfully +// (fallback enabled). +constexpr char kMlirWithFallbackModeSuccess[] = "kMlirWithFallbackModeSuccess"; +// MLIR bridge phase 2 compilation was failure (fallback enabled). 
+constexpr char kMlirWithFallbackModeFailure[] = "kMlirWithFallbackModeFailure"; +// MLIR bridge phase 2 compilation was successful (manually enabled). +constexpr char kMlirModeSuccess[] = "kMlirModeSuccess"; +// MLIR bridge phase 2 compilation fails (manually enabled) +constexpr char kMlirModeFailure[] = "kMlirModeFailure"; +// Old bridge compilation was run successfully (was run because MLIR bridge +// could not process the graph). +constexpr char kOldBridgeMlirFilteredSuccess[] = + "kOldBridgeMlirFilteredSuccess"; +// Old bridge failed (was run b/c MLIR bridge could not process the graph). +constexpr char kOldBridgeMlirFilteredFailure[] = + "kOldBridgeMlirFilteredFailure"; +// Old bridge compilation was successfully run after MLIR bridge ran and failed. +constexpr char kOldBridgeWithFallbackModeSuccess[] = + "kOldBridgeWithFallbackModeSuccess"; +// Old Bridge failed in fallback (was run because MLIR bridge failed first). +constexpr char kOldBridgeWithFallbackModeFailure[] = + "kOldBridgeWithFallbackModeFailure"; + +// Time the execution of kernels (in CPU cycles). Meant to be used as RAII. +struct CompilationTimer { + uint64 start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle(); + + uint64 ElapsedCycles() { + return profile_utils::CpuUtils::GetCurrentClockCycle() - start_cycles; + } + + int64_t ElapsedCyclesInMilliseconds() { + std::chrono::duration duration = + profile_utils::CpuUtils::ConvertClockCycleToTime(ElapsedCycles()); + + return std::chrono::duration_cast(duration) + .count(); + } +}; + +namespace { + +bool ShouldFallbackToGraphCompiler( + const std::variant& computation) { + if (computation.index() == 1) return true; + + return std::get<0>(computation).rollout_state == + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_DISABLED; +} + +Status CompileFromMlirToXlaHlo( + bool enable_op_fallback, + const std::variant& computation, + const tpu::TPUCompileMetadataProto& metadata, llvm::StringRef device_type, + const XlaShapeLayoutHelpers::ShapeDeterminationFns& shape_determination_fns, + bool use_tuple_args, XlaCompiler::CompilationResult* compilation_result, + std::vector>& custom_legalization_passes, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes) { + if (enable_op_fallback) { + LOG_FIRST_N(INFO, 1) + << "Compiling MLIR computation to XLA HLO using MLIR tf2xla bridge in " + "the op by op fallback mode. This is Phase 2 of the TF2XLA Bridge. " + "Old (non-MLIR) bridge may be used in case of unsupported feature " + "or compilation failure from the MLIR bridge (full fallback mode)."; + } else { + LOG_FIRST_N(INFO, 1) + << "Compiling MLIR computation to XLA HLO using MLIR tf2xla bridge " + "phase 2. Fallback to the old (non-MLIR) bridge is disabled. 
" + "Op-by-op fallback is also disabled."; + } + + mlir::DialectRegistry registry; + mlir::RegisterAllTensorFlowDialects(registry); + mlir::mhlo::registerAllMhloDialects(registry); + mlir::stablehlo::registerAllDialects(registry); + mlir::MLIRContext context(registry); + mlir::OwningOpRef mlir_module; + TF_RETURN_IF_ERROR(DeserializeMlirModule(std::get<0>(computation).mlir_module, + &context, &mlir_module)); + if (!mlir::SetTPUInfeedLayout(mlir_module)) + return errors::Internal("Failed to set layouts attribute"); + + TF_RETURN_IF_ERROR(CompileSerializedMlirToXlaHlo( + SerializeMlirModule(mlir_module.get()), arg_shapes, device_type, + use_tuple_args, enable_op_fallback, shape_determination_fns, + compilation_result, custom_legalization_passes, metadata.module_name())); + + // Compute how arguments are shared across different cores. + return tpu::GetShardingInfo(metadata, arg_shapes, shape_determination_fns, + arg_core_mapping, per_core_arg_shapes); +} + +} // namespace + +tsl::StatusOr LegalizeMlirToHlo( + const std::variant& computation, + const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, + llvm::StringRef device_type, + std::vector>& custom_legalization_passes, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + xla::CompileOnlyClient* client) { + XlaCompilationResult compilation_result; + // If there are no MLIR args, compile the given function in the library. + if (ShouldFallbackToGraphCompiler(computation)) { + TF_RETURN_IF_ERROR(tf2xla::v0::CompileTensorflowGraphToHlo( + computation, metadata, use_tuple_args, shape_determination_fns, + arg_shapes, arg_core_mapping, per_core_arg_shapes, client, + &compilation_result)); + return compilation_result; + } + + // We could only end up here if the MLIR bridge was explicitly enabled or + // if it was in the default/unspecified state and graph analysis in the first + // phase has not identified unsupported features. + // Enabling op fallback also enables whole graph fallback if op by op + // fallback failed. + bool enable_op_fallback = + std::get<0>(computation).rollout_state != + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED; + + Status mlir_bridge_status = tsl::OkStatus(); + { + CompilationTimer timer; + std::string enabled_string = enable_op_fallback ? "enabled" : "disabled"; + const std::string kMlirBridgeFallback = + absl::StrCat("mlir_bridge_op_fallback_", enabled_string); + + mlir_bridge_status = CompileFromMlirToXlaHlo( + enable_op_fallback, computation, metadata, device_type, + shape_determination_fns, use_tuple_args, &compilation_result, + custom_legalization_passes, arg_shapes, arg_core_mapping, + per_core_arg_shapes); + + phase2_bridge_compilation_time->GetCell(kMlirBridgeFallback) + ->Add(timer.ElapsedCyclesInMilliseconds()); + } + + if (mlir_bridge_status.ok()) { + if (enable_op_fallback) { + VLOG(1) << "Successfully compiled MLIR computation to XLA HLO using MLIR " + "tf2xla bridge"; + mlir_second_phase_count->GetCell(kMlirWithFallbackModeSuccess) + ->IncrementBy(1); + } else { + mlir_second_phase_count->GetCell(kMlirModeSuccess)->IncrementBy(1); + } + return compilation_result; + } else if (!enable_op_fallback) { + // Don't fallback to the old bridge if op-by-op fallback isn't enabled. 
+ mlir_second_phase_count->GetCell(kMlirModeFailure)->IncrementBy(1); + return mlir_bridge_status; + } + + bool filtered_graph = false; + if (mlir_bridge_status == CompileToHloGraphAnalysisFailedError()) { + VLOG(1) << "Filtered out MLIR computation to XLA HLO using MLIR tf2xla " + "bridge. Falling back to old (non-MLIR) bridge."; + filtered_graph = true; + } else { + mlir_second_phase_count->GetCell(kMlirWithFallbackModeFailure) + ->IncrementBy(1); + + VLOG(1) << "Failed to compile MLIR computation to XLA HLO using MLIR " + "tf2xla bridge. Falling back to old (non-MLIR) bridge. MLIR " + "bridge compilation status: " + << mlir_bridge_status; + } + + Status old_bridge_status = tf2xla::v0::CompileTensorflowGraphToHlo( + computation, metadata, use_tuple_args, shape_determination_fns, + arg_shapes, arg_core_mapping, per_core_arg_shapes, client, + &compilation_result); + + // Record filter/failure stats only if the old bridge succeeds. This removes + // noise from invalid inputs. + if (!old_bridge_status.ok()) { + // If the old bridge failed for this input as well. Mark the input as + // invalid. This might be incorrect in case of old bridge bugs but that + // should be rare. + if (filtered_graph) { + mlir_second_phase_count->GetCell(kOldBridgeMlirFilteredFailure) + ->IncrementBy(1); + } else { + mlir_second_phase_count->GetCell(kOldBridgeWithFallbackModeFailure) + ->IncrementBy(1); + } + return old_bridge_status; + } + + if (filtered_graph) { + mlir_second_phase_count->GetCell(kOldBridgeMlirFilteredSuccess) + ->IncrementBy(1); + } else { + mlir_second_phase_count->GetCell(kOldBridgeWithFallbackModeSuccess) + ->IncrementBy(1); + } + return compilation_result; +} + +}; // namespace v1 +}; // namespace tf2xla +}; // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.h b/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.h new file mode 100644 index 00000000000..f9ddb0cad78 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.h @@ -0,0 +1,68 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_LEGALIZE_TF_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_LEGALIZE_TF_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/variant.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tf2xla/api/v1/device_type.pb.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/client/compile_only_client.h" +#include "tensorflow/compiler/xla/pjrt/compile_options.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/tpu/kernels/tpu_compile.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace tensorflow { +namespace tf2xla { +namespace v1 { + +// Legalizes the given mlir::Module into XLA HLO. If successful, returns the +// compiled XLA HLO. V1 of the tf2xla uses MLIR whereas V0 does not use MLIR. +// +// Inputs: +// computation - The MLIR module op. It currently takes in +// tpu::FunctionToHloArgs but this is deprecated. arg_shapes - The shapes of +// the arguments in module_op. device_type - The device type to compile for. +// use_tuple_args - Pack the incoming arg shapes into a single tuple. +// custom_legalization_passes - Extra passes to lower from TF -> MHLO. +// arg_shapes - The shapes of the args. +// arg_core_mapping - Which args go on which cores. +// per_core_arg_shapes - For each core, the shapes for each argument. +// client - The Xla Compilation client. +tsl::StatusOr LegalizeMlirToHlo( + const std::variant& computation, + const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, + llvm::StringRef device_type, + std::vector>& custom_legalization_passes, + XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + xla::CompileOnlyClient* client); + +}; // namespace v1 +}; // namespace tf2xla +}; // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_API_V1_LEGALIZE_TF_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf_test.cc new file mode 100644 index 00000000000..32e90358ab9 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf_test.cc @@ -0,0 +1,149 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.h" + +#include +#include + +#include +#include +#include "tensorflow/compiler/mlir/tf2xla/api/v1/device_type.pb.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/core/lib/monitoring/cell_reader.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/tsl/lib/monitoring/test_utils.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace tensorflow { +namespace tf2xla { +namespace v1 { + +using ::tensorflow::monitoring::testing::CellReader; +using tpu::FunctionToHloArgs; +using tpu::MlirToHloArgs; +using tpu::ShardingAndIndex; +using tpu::TPUCompileMetadataProto; +using ::tsl::monitoring::testing::Histogram; + +static constexpr char kCompilationTimeStreamzName[] = + "/tensorflow/core/tf2xla/api/v1/phase2_compilation_time"; +static constexpr char kCompilationStatusStreamzName[] = + "/tensorflow/core/tf2xla/api/v1/phase2_compilation_status"; + +static constexpr char kMlirModuleStr[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main() -> () { + func.return + } +})"; + +tsl::StatusOr CompileMlirModule( + ConfigProto::Experimental::MlirBridgeRollout rollout_state) { + MlirToHloArgs mlir_to_hlo_args; + mlir_to_hlo_args.rollout_state = rollout_state; + mlir_to_hlo_args.mlir_module = kMlirModuleStr; + + se::Platform* platform = + se::MultiPlatformManager::PlatformWithName("Host").value(); + auto client = + xla::ClientLibrary::GetOrCreateCompileOnlyClient(platform).value(); + + std::vector arg_shapes; + TPUCompileMetadataProto metadata_proto; + bool use_tuple_args = true; + std::vector arg_core_mapping; + std::vector> per_core_arg_shapes; + std::vector> custom_legalization_passes; + + return LegalizeMlirToHlo(mlir_to_hlo_args, metadata_proto, use_tuple_args, + /*device_type=*/"XLA_TPU_JIT", + custom_legalization_passes, + /*shape_determination_fns=*/{}, arg_shapes, + &arg_core_mapping, &per_core_arg_shapes, client); +} + +TEST(LegalizeTFTest, RecordsStreamzForMlirBridge) { + CellReader compilation_time(kCompilationTimeStreamzName); + CellReader compilation_status(kCompilationStatusStreamzName); + + TF_ASSERT_OK_AND_ASSIGN( + XlaCompiler::CompilationResult result, + CompileMlirModule( + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED)); + + Histogram histogram = + compilation_time.Delta("mlir_bridge_op_fallback_disabled"); + EXPECT_EQ(histogram.num(), 1); + EXPECT_EQ(compilation_status.Delta("kMlirModeSuccess"), 1); +} + +TEST(LegalizeTFTest, RecordsStreamzForMlirOpFallback) { + CellReader compilation_time(kCompilationTimeStreamzName); + + TF_ASSERT_OK_AND_ASSIGN( + XlaCompiler::CompilationResult result, + CompileMlirModule( + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_UNSPECIFIED)); + + Histogram histogram = + compilation_time.Delta("mlir_bridge_op_fallback_enabled"); + EXPECT_EQ(histogram.num(), 1); +} + +TEST(LegalizeTFTest, RecordsStreamzForNoMlirFallback) { + FunctionDef my_func = + tensorflow::FunctionDefHelper::Create("empty", {}, {}, {}, {}, {}); + + tensorflow::FunctionDefLibrary fdef; + *(fdef.add_function()) = my_func; + tensorflow::FunctionLibraryDefinition flib_def( + tensorflow::OpRegistry::Global(), fdef); + + OpInputList guaranteed_constants; + NameAttrList function; 
+ FunctionToHloArgs function_to_hlo_args{&function, + &flib_def, + /*graph_def_version=*/0, + {&guaranteed_constants}}; + + se::Platform* cpu_platform = + se::MultiPlatformManager::PlatformWithName("Host").value(); + auto client = + xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform).value(); + + std::vector arg_shapes; + TPUCompileMetadataProto metadata_proto; + bool use_tuple_args = true; + std::vector arg_core_mapping; + std::vector> per_core_arg_shapes; + std::vector> custom_legalization_passes; + + // This doesn't actually compile correctly. + tsl::StatusOr compile_result = + LegalizeMlirToHlo(function_to_hlo_args, metadata_proto, use_tuple_args, + /*device_type=*/"XLA_CPU_JIT", + custom_legalization_passes, + /*shape_determination_fns=*/{}, arg_shapes, + &arg_core_mapping, &per_core_arg_shapes, client); + + EXPECT_FALSE(compile_result.ok()); +} + +} // namespace v1 +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc index b4736462e26..6479253dd6e 100644 --- a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc +++ b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" +#include + #include "tensorflow/compiler/jit/flags.h" namespace tensorflow { @@ -22,7 +24,7 @@ namespace tensorflow { MlirBridgeRolloutPolicy GetMlirBridgeRolloutPolicy( const tensorflow::Graph& graph, const FunctionLibraryDefinition* function_library, - std::optional config_proto, + std::optional config_proto, bool is_tpu_graph, bool uses_uninitialized_resource_args, bool is_v1_compat, bool record_stats) { switch (GetMlirBridgeRolloutState(config_proto)) { diff --git a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h index 262ebc0fd2e..9f67442205d 100644 --- a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h +++ b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h @@ -16,8 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_MLIR_BRIDGE_ROLLOUT_POLICY_H_ #define TENSORFLOW_COMPILER_MLIR_TF2XLA_MLIR_BRIDGE_ROLLOUT_POLICY_H_ +#include + #include "mlir/IR/BuiltinOps.h" -#include "absl/types/optional.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/protobuf/config.pb.h" @@ -52,7 +53,7 @@ enum class MlirBridgeRolloutPolicy { MlirBridgeRolloutPolicy GetMlirBridgeRolloutPolicy( const tensorflow::Graph& graph, const FunctionLibraryDefinition* function_library, - std::optional config_proto, + std::optional config_proto, bool is_tpu_graph, bool uses_uninitialized_resource_args, bool is_v1_compat, bool record_stats); diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-communication.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-communication.mlir index f0cc4783d9d..be4026d77c6 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-communication.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-communication.mlir @@ -858,3 +858,15 @@ func.func @multi_block_func() { %0 = "tf.XlaRecvFromHost"() {key = "recv_key", shape = #tf_type.shape<>} : () -> tensor func.return } + +// ----- + +// CHECK-LABEL: func @host_compute_manual_sharding +func.func @host_compute_manual_sharding(%arg0: tensor) { + // CHECK: "mhlo.send" + // CHECK-SAME: mhlo.sharding = "\08\04" + // CHECK: "mhlo.recv" + // CHECK-SAME: mhlo.sharding = "\08\04" + %0 = "tf._XlaHostComputeMlir"(%arg0) {recv_key = "host_compute_channel_recv", send_key = "host_compute_channel_send", host_mlir_module = "", manual_sharding = true} : (tensor) -> tensor + func.return +} diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-include-tf2xla-fallback.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-include-tf2xla-fallback.mlir index ab6d07c3e84..6382f05f708 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-include-tf2xla-fallback.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-include-tf2xla-fallback.mlir @@ -1,7 +1,7 @@ -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=false" -verify-diagnostics %s | FileCheck --check-prefix NO_FALLBACK %s -// RUN: tf-opt "-xla-legalize-tf=use-tf2xla-fallback=true device-type=XLA_CPU_JIT" -verify-diagnostics %s | FileCheck --check-prefix SUPPORTED_FALLBACK_DEVICE %s -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=true" %s | FileCheck --check-prefix UNSPECIFIED_FALLBACK_DEVICE %s -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=true device-type=INVALID_DEVICE_TYPE" %s | FileCheck --check-prefix UNSUPPORTED_FALLBACK_DEVICE %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=false use-tf2xla-hlo-importer=false" -verify-diagnostics %s | FileCheck --check-prefix NO_FALLBACK %s +// RUN: tf-opt "-xla-legalize-tf=use-tf2xla-fallback=true device-type=XLA_CPU_JIT use-tf2xla-hlo-importer=false" -verify-diagnostics %s | FileCheck --check-prefix SUPPORTED_FALLBACK_DEVICE %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=true use-tf2xla-hlo-importer=false" %s | FileCheck --check-prefix UNSPECIFIED_FALLBACK_DEVICE %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion use-tf2xla-fallback=true device-type=INVALID_DEVICE_TYPE use-tf2xla-hlo-importer=false" %s | FileCheck --check-prefix UNSUPPORTED_FALLBACK_DEVICE %s // We run this test four times: // 1) Legalize without using TF2XLA fallback (ops cannot be legalized). 
diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir index 2f69349d13b..a2a20eb558c 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-prefer-tf2xla.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion device-type=XLA_CPU_JIT legalize-chlo=false use-tf2xla-fallback=true prefer-tf2xla=true" %s | FileCheck %s -// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion device-type=XLA_CPU_JIT legalize-chlo=false prefer-tf2xla=true" %s | FileCheck --check-prefix NOFALLBACK %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion device-type=XLA_CPU_JIT legalize-chlo=false use-tf2xla-fallback=true prefer-tf2xla=true use-tf2xla-hlo-importer=false" %s | FileCheck %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion device-type=XLA_CPU_JIT legalize-chlo=false prefer-tf2xla=true use-tf2xla-hlo-importer=false" %s | FileCheck --check-prefix NOFALLBACK %s module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { @@ -92,4 +92,51 @@ func.func @simple_strided_slice(%input: tensor<4x8xf32>) -> tensor<3x2xf32> { func.return %output : tensor<3x2xf32> } +//===----------------------------------------------------------------------===// +// Fused op legalizations. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: fused_conv2d +func.func @fused_conv2d(%input: tensor<1x300x300x40xi8>, + %filter: tensor<3x3x40x40xi8>, + %bias: tensor<40xf32>, + %act: tensor<0xi8>) -> tensor<1x300x300x40xi8> { + + // CHECK: %[[v0:.*]] = mhlo.constant dense<1.000000e+00> : tensor + // CHECK-NEXT: %[[v1:.*]] = mhlo.constant dense<2.000000e+00> : tensor + // CHECK-NEXT: %[[v2:.*]] = mhlo.constant dense<1.000000e+00> : tensor + // CHECK-NEXT: %[[v3:.*]] = mhlo.constant dense<2.000000e+00> : tensor + // CHECK-NEXT: %[[v4:.*]] = mhlo.convert %arg0 : (tensor<1x300x300x40xi8>) -> tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v5:.*]] = mhlo.convert %arg1 : (tensor<3x3x40x40xi8>) -> tensor<3x3x40x40xf32> + // CHECK: %[[v6:.*]] = mhlo.convolution(%[[v4]], %[[v5]]) + // CHECK-SAME{LITERAL}: dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f] + // CHECK-SAME{LITERAL}: window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} + // CHECK-SAME: batch_group_count = 1 + // CHECK-SAME: feature_group_count = 1 + // CHECK-NEXT: %[[v7:.*]] = mhlo.convert %[[v6]] : tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v8:.*]] = "mhlo.broadcast_in_dim"(%[[v2]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v9:.*]] = mhlo.multiply %[[v7]], %[[v8]] : tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v10:.*]] = mhlo.convert %arg2 : tensor<40xf32> + // CHECK-NEXT: %[[v11:.*]] = "mhlo.broadcast_in_dim"(%[[v10]]) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<40xf32>) -> tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v12:.*]] = mhlo.add %[[v9]], %[[v11]] : tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v13:.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK-NEXT: %[[v14:.*]] = "mhlo.broadcast_in_dim"(%[[v13]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v15:.*]] = mhlo.maximum %[[v12]], %[[v14]] : tensor<1x300x300x40xf32> + // CHECK-DAG: %[[v16:.*]] = mhlo.constant 
dense<-1.280000e+02> : tensor + // CHECK-DAG: %[[v17:.*]] = mhlo.constant dense<1.270000e+02> : tensor + // CHECK-DAG: %[[v18:.*]] = "mhlo.broadcast_in_dim"(%[[v16]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x300x300x40xf32> + // CHECK-DAG: %[[v19:.*]] = "mhlo.broadcast_in_dim"(%[[v17]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<1x300x300x40xf32> + // CHECK: %[[v20:.*]] = mhlo.clamp %[[v18]], %[[v15]], %[[v19]] : tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v21:.*]] = mhlo.round_nearest_even %[[v20]] : tensor<1x300x300x40xf32> + // CHECK-NEXT: %[[v22:.*]] = mhlo.convert %[[v21]] : (tensor<1x300x300x40xf32>) -> tensor<1x300x300x40xi8> + // CHECK-NEXT: return %[[v22]] : tensor<1x300x300x40xi8> + + %input_scale = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + %side_input_scale = "tf.Const"() {value = dense<2.0> : tensor} : () -> tensor + %conv2d = "tf._FusedConv2D"(%input, %filter, %bias, %act, %input_scale, %side_input_scale) { + data_format = "NHWC", dilations = [1, 1, 1, 1], epsilon = 9.99999974E-5 : f32, explicit_paddings = [], filter_format = "HWIO", fused_ops = ["BiasAdd", "Relu"], leakyrelu_alpha = 2.000000e-01 : f32, num_args = 2 : i64, operand_segment_sizes = array, padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true + } : (tensor<1x300x300x40xi8>, tensor<3x3x40x40xi8>, tensor<40xf32>, tensor<0xi8>, tensor, tensor) -> tensor<1x300x300x40xi8> + func.return %conv2d : tensor<1x300x300x40xi8> } + +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir new file mode 100644 index 00000000000..1d6bfb6bcd7 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir @@ -0,0 +1,556 @@ +// RUN: tf-opt "-xla-legalize-tf=device-type=XLA_CPU_JIT allow-partial-conversion=true prefer-tf2xla=true use-tf2xla-fallback=true use-tf2xla-hlo-importer=true" %s -verify-diagnostics -mlir-disable-threading | FileCheck %s + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + // CHECK-LABEL: binary_op + func.func @binary_op(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: mhlo.atan2 + %0 = "tf.Atan2"(%arg0, %arg1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + func.return %0 : tensor<2xf32> + } + + // CHECK-LABEL: multiple_return_values + func.func @multiple_return_values(%arg0: tensor<3xi64>) -> tensor { + %0:3 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<3xi64>) -> (tensor, tensor, tensor) + // CHECK: return %1 : tensor + func.return %0#0 : tensor + } + + // CHECK-LABEL: constant_parameter + func.func @constant_parameter(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tf.Const"() {value = dense<1.42> : tensor<2xf32>} : () -> tensor<2xf32> + // CHECK: mhlo.atan2 %arg0, %4 + %1 = "tf.Atan2"(%arg0, %0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + func.return %0 : tensor<2xf32> + } + + // CHECK-LABEL: uses_translated_return_type + func.func @uses_translated_return_type(%arg0: tensor<3xf32>) -> tensor { + // CHECK: tensor.cast %{{[0-9]+}} : tensor> to tensor + %y, %idx = "tf.Unique"(%arg0) {device = ""} : (tensor<3xf32>) -> (tensor, tensor<3xi32>) + return %y : tensor + } + + // CHECK-LABEL: @abs + func.func @abs(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK-NOT: tf.Abs + %0 = "tf.Abs"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + func.return %0 : 
tensor<2xf32> + } + + // CHECK-LABEL: func @testBroadcastGradientArgs + func.func @testBroadcastGradientArgs(%s0: tensor<4xi32>, %s1: tensor<4xi32>) -> (tensor<1xi32>, tensor<0xi32>) { + // CHECK: tf.BroadcastGradientArgs + %r0, %r1 = "tf.BroadcastGradientArgs"(%s0, %s1) : (tensor<4xi32>, tensor<4xi32>) -> (tensor<1xi32>, tensor<0xi32>) + func.return %r0, %r1 : tensor<1xi32>, tensor<0xi32> + } + + // CHECK-LABEL: @acos + func.func @acos(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK-NOT: tf.Acos + %0 = "tf.Acos"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + func.return %0 : tensor<2xf32> + } + + // CHECK-LABEL: strided_slice_uses_mlir + func.func @strided_slice_uses_mlir(%input: tensor<4x8xf32>) -> tensor<3x2xf32> { + %begin = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi32>} : () -> (tensor<2xi32>) + %end = "tf.Const"() {value = dense<[3, 7]> : tensor<2xi32>} : () -> (tensor<2xi32>) + %strides = "tf.Const"() {value = dense<[1, 3]> : tensor<2xi32>} : () -> (tensor<2xi32>) + + // CHECK-NOT: tf.StridedSlice + %output = "tf.StridedSlice"(%input, %begin, %end, %strides) + : (tensor<4x8xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<3x2xf32> + func.return %output : tensor<3x2xf32> + } + + // CHECK-LABEL: func @random_uniform_uses_mlir + func.func @random_uniform_uses_mlir(%arg0: tensor<3xi32>) -> tensor<12x?x64xf32> { + // CHECK-NOT: tf.RandomUniform + %0 = "tf.RandomUniform"(%arg0) : (tensor<3xi32>) -> tensor<12x?x64xf32> + func.return %0 : tensor<12x?x64xf32> + } + + // CHECK-LABEL: unknown_op + func.func @unknown_op(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: tf.CustomTestOp + %0 = "tf.CustomTestOp"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + + func.return %0 : tensor<2xf32> + } + + // CHECK-LABEL: add_v2 + func.func @add_v2(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK: mhlo.add %arg0, %arg0 : tensor<2xi32> + %0 = "tf.AddV2"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + func.return %0: tensor<2xi32> + } + + // CHECK-LABEL: not_allowlisted_op + func.func @not_allowlisted_op(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: tf.TensorListReserve + %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor>> + // CHECK: tf.TensorListGetItem + %1 = "tf.TensorListGetItem"(%0, %arg2, %arg0) : (tensor>>, tensor, tensor<3xi32>) -> tensor + func.return %1 : tensor + } + + // CHECK-LABEL: unranked_operand + func.func @unranked_operand(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: tf.Atan2 + // expected-remark@+1 {{lowering requires bounded tensor operands}} + %0 = "tf.Atan2"(%arg0, %arg0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + + func.return %0 : tensor<*xf32> + } + + // CHECK-LABEL: dynamic_operand + func.func @dynamic_operand(%arg0: tensor) -> tensor { + // CHECK: tf.Atan2 + // expected-remark@+1 {{lowering requires bounded tensor operands}} + %0 = "tf.Atan2"(%arg0, %arg0) : (tensor, tensor) -> tensor + + func.return %0 : tensor + } + + // CHECK-LABEL: tuple_type + func.func @tuple_type(%arg0: tuple, tensor>) -> tensor { + // Verifies that the pass can handle operands of non-tensor type like tuple + // from non TensorFlow ops. 
+ %0 = "mhlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple, tensor>) -> tensor + func.return %0 : tensor + } + + // CHECK-LABEL: unsupported_dtype + func.func @unsupported_dtype(%arg0: tensor<2x!tf_type.variant>) -> tensor<2x!tf_type.variant> { + // CHECK: tf.AddN + // expected-remark@+1 {{skipping legalization due to unsupported type 'tensor<2x!tf_type.variant>'}} + %0 = "tf.AddN"(%arg0, %arg0) : (tensor<2x!tf_type.variant>, tensor<2x!tf_type.variant>) -> tensor<2x!tf_type.variant> + + func.return %0 : tensor<2x!tf_type.variant> + } + + // CHECK-LABEL: multiple_dialect_ops + func.func @multiple_dialect_ops(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: mhlo.negate + %0 = "mhlo.negate"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: mhlo.atan2 + %1 = "tf.Atan2"(%arg0, %0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + + func.return %1 : tensor<2xf32> + } + + // CHECK-LABEL: binary_op_broadcast + func.func @binary_op_broadcast(%arg0: tensor<4x1xf32>, %arg1: tensor<4x1x4xf32>) -> tensor<4x4x4xf32> { + // CHECK: %[[BROADCAST0:.*]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4x1xf32>) -> tensor<4x4x1xf32> + // CHECK: %[[RESHAPE0:.*]] = mhlo.reshape %[[BROADCAST0]] : (tensor<4x4x1xf32>) -> tensor<4x4xf32> + // CHECK: %[[UPDATED_ARG0:.*]] = "mhlo.broadcast_in_dim"(%[[RESHAPE0]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x4xf32>) -> tensor<4x4x4xf32> + + // CHECK: %[[RESHAPE1:.*]] = mhlo.reshape %arg1 : (tensor<4x1x4xf32>) -> tensor<4x4xf32> + // CHECK: %[[UPDATED_ARG1:.*]] = "mhlo.broadcast_in_dim"(%[[RESHAPE1]]) {broadcast_dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<4x4xf32>) -> tensor<4x4x4xf32> + + // CHECK: %[[RESULT:.*]] = mhlo.atan2 %[[UPDATED_ARG0]], %[[UPDATED_ARG1]] : tensor<4x4x4xf32> + // CHECK: return %[[RESULT]] : tensor<4x4x4xf32> + + %0 = "tf.Atan2"(%arg0, %arg1) : (tensor<4x1xf32>, tensor<4x1x4xf32>) -> tensor<4x4x4xf32> + func.return %0: tensor<4x4x4xf32> + } + + // CHECK-LABEL: func @ternary_op + func.func @ternary_op(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { + // CHECK: mhlo.select %arg0, %arg1, %arg2 + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + func.return %0: tensor<2xi32> + } + + // CHECK-LABEL: func @convert + func.func @convert(%arg0: tensor<2xi32>) -> tensor<2xf32> { + // CHECK: mhlo.convert %arg0 : (tensor<2xi32>) -> tensor<2xf32> + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> + func.return %0 : tensor<2xf32> + } + + // CHECK-LABEL: func @constant + func.func @constant(%arg0: tensor) -> tensor { + // CHECK: %[[ONE:.*]] = mhlo.constant dense<1.000000e+00> : tensor + // CHECK: %[[RESULT:.*]] = mhlo.divide %[[ONE]], %arg0 : tensor + // CHECK: return %[[RESULT]] + + %0 = "tf.Inv"(%arg0) : (tensor) -> tensor + func.return %0 : tensor + } + + // CHECK-LABEL: func @const_inputs + // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x2xf64>, %[[ARG1:.*]]: tensor, + func.func @const_inputs(%arg0: tensor<2x2xf64>, %arg1: tensor, %arg2: tensor<2xi32>, %arg3: tensor<2xi32>, %arg4: tensor<2xi32>) -> tensor<6x5xf64> { + + // CHECK: "mhlo.pad"(%[[ARG0]], %[[ARG1]]) + // CHECK-SAME-DAG: edge_padding_high = dense<[1, 2]> : tensor<2xi64> + // CHECK-SAME-DAG: edge_padding_low = dense<[2, 1]> : tensor<2xi64> + // CHECK-SAME-DAG: interior_padding = dense<[1, 0]> : tensor<2xi64> + + %0 = mhlo.constant dense<[2, 1]> : tensor<2xi32> + %1 = mhlo.constant dense<[1, 
2]> : tensor<2xi32> + %2 = mhlo.constant dense<[1, 0]> : tensor<2xi32> + %3 = "tf.XlaPad"(%arg0, %arg1, %0, %1, %2) : (tensor<2x2xf64>, tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<6x5xf64> + func.return %3 : tensor<6x5xf64> + } + + func.func @non_const_inputs(%arg0: tensor<2x2xf64>, %arg1: tensor, %arg2: tensor<2xi32>, %arg3: tensor<2xi32>, %arg4: tensor<2xi32>) -> tensor<6x5xf64> { + // expected-remark@+1 {{lowering requires operand #2 to be a constant}} + %0 = "tf.XlaPad"(%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<2x2xf64>, tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<6x5xf64> + func.return %0 : tensor<6x5xf64> + } + + // CHECK-LABEL: dynamic_result_type + func.func @dynamic_result_type(%arg0: tensor<2xf32>) -> tensor<*xf32> { + // CHECK: %[[RESULT:.*]] = mhlo.atan2 %arg0, %arg0 : tensor<2xf32> + // CHECK: tensor.cast %[[RESULT]] : tensor<2xf32> to tensor<*xf32> + %0 = "tf.Atan2"(%arg0, %arg0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<*xf32> + + // return %[[RESULT]] + func.return %0 : tensor<*xf32> + } + + func.func @truncated_normal() -> tensor<2x2xf32> { + // CHECK-NOT: tf.TruncatedNormal + %0 = mhlo.constant dense<[2, 2]> : tensor<2xi32> + %1 = "tf.TruncatedNormal"(%0) {T = i32, device = "", dtype = f32, seed = 0 : i64, seed2 = 1950157571 : i64} : (tensor<2xi32>) -> tensor<2x2xf32> + func.return %1 : tensor<2x2xf32> + } + + // CHECK-LABEL: dynamic_update_slice + // CHECK-SAME: (%[[ARG0:.*]]: tensor<3x4xi32>, %[[ARG1:.*]]: tensor<2x2xi32>, %[[ARG2:.*]]: tensor<2xi32> + func.func @dynamic_update_slice(%arg0: tensor<3x4xi32>, %arg1: tensor<2x2xi32>, %arg2: tensor<2xi32>) -> tensor<3x4xi32> { + + // CHECK: %[[SLICE0:.*]] = "mhlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM0:.*]] = mhlo.reshape %[[SLICE0]] : (tensor<1xi32>) -> tensor + + // CHECK: %[[SLICE1:.*]] = "mhlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<2> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM1:.*]] = mhlo.reshape %[[SLICE1]] : (tensor<1xi32>) -> tensor + + // CHECK: mhlo.dynamic_update_slice %[[ARG0]], %[[ARG1]], %[[DIM0]], %[[DIM1]] + + %0 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %arg2) : (tensor<3x4xi32>, tensor<2x2xi32>, tensor<2xi32>) -> tensor<3x4xi32> + func.return %0: tensor<3x4xi32> + } + + // CHECK-LABEL: @sparse_to_dense + // CHECK-SAME: (%[[ARG0:.*]]: tensor<3x2xi32>, %[[ARG1:.*]]: tensor<3xf32>, %[[ARG2:.*]]: tensor) + func.func @sparse_to_dense(%arg0: tensor<3x2xi32>, %arg1: tensor<3xf32>, %arg2: tensor) -> tensor<3x3xf32> { + + // CHECK: %[[DEFAULT:.*]] = "mhlo.broadcast_in_dim"(%[[ARG2]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<3x3xf32> + + // CHECK: %[[RESULT:.*]] = "mhlo.scatter"(%[[DEFAULT]], %[[ARG0]], %[[ARG1]]) ({ + // CHECK: ^bb0(%[[ARG3:.*]]: tensor, %[[ARG4:.*]]: tensor): + // CHECK: mhlo.return %[[ARG4]] : tensor + // CHECK: }) + // CHECK-SAME: indices_are_sorted = false + // CHECK-SAME: scatter_dimension_numbers + // CHECK-SAME: inserted_window_dims = [0, 1] + // CHECK-SAME: scatter_dims_to_operand_dims = [0, 1] + // CHECK-SAME: index_vector_dim = 1 + // CHECK-SAME: unique_indices = false + // CHECK-SAME: (tensor<3x3xf32>, 
tensor<3x2xi32>, tensor<3xf32>) -> tensor<3x3xf32> + + // return %[[RESULT]] : tensor<3x3xf32> + + %cst = mhlo.constant dense<3> : tensor<2xi32> + %0 = "tf.SparseToDense"(%arg0, %cst, %arg1, %arg2) {validate_indices = true}: (tensor<3x2xi32>, tensor<2xi32>, tensor<3xf32>, tensor) -> tensor<3x3xf32> + func.return %0 : tensor<3x3xf32> + } + + // CHECK-LABEL: reverse_sequence + func.func @reverse_sequence(%arg0: tensor<4x2x3x1x1xi32>, %arg1: tensor<3xi32>) -> tensor<4x2x3x1x1xi32> { + // CHECK-NOT: tf.ReverseSequence + %0 = "tf.ReverseSequence"(%arg0, %arg1) {batch_dim = 2 : i64, seq_dim = 0 : i64}: (tensor<4x2x3x1x1xi32>, tensor<3xi32>) -> tensor<4x2x3x1x1xi32> + func.return %0 : tensor<4x2x3x1x1xi32> + } + + // CHECK-LABEL: mirror_pad + func.func @mirror_pad(%arg0: tensor<2x3xcomplex>) -> tensor<4x7xcomplex> { + %0 = mhlo.constant dense<[[1, 1], [2, 2]]> : tensor<2x2xi32> + // CHECK-NOT: tf.MirrorPad + %1 = "tf.MirrorPad"(%arg0, %0) {mode = "SYMMETRIC"} : (tensor<2x3xcomplex>, tensor<2x2xi32>) -> tensor<4x7xcomplex> + func.return %1 : tensor<4x7xcomplex> + } + + // CHECK-LABEL: bucketize + func.func @bucketize(%arg0: tensor<2x5xf32>) -> tensor<2x5xi32> { + // CHECK-NOT: tf.Bucketize + %0 = "tf.Bucketize"(%arg0) {boundaries = [0.000000e+00 : f32, 3.000000e+00 : f32, 8.000000e+00 : f32, 1.100000e+01 : f32]} : (tensor<2x5xf32>) -> tensor<2x5xi32> + func.return %0 : tensor<2x5xi32> + } + + // CHECK-LABEL: arg_min + func.func @arg_min(%arg0: tensor<6xf64>) -> tensor { + // CHECK-NOT: ArgMin + %0 = mhlo.constant dense<0> : tensor + %1 = "tf.ArgMin"(%arg0, %0) : (tensor<6xf64>, tensor) -> tensor + func.return %1 : tensor + } + + // CHECK-LABEL: non_max_suppression_v4 + func.func @non_max_suppression_v4(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor) -> tensor<2xi32> { + %max_size = mhlo.constant dense<2> : tensor + // CHECK-NOT: tf.NonMaxSuppressionV4 + %0:2 = "tf.NonMaxSuppressionV4"(%arg0, %arg1, %max_size, %arg2, %arg3) {pad_to_max_output_size = true}: (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>, tensor) + func.return %0#0 : tensor<2xi32> + } + + // CHECK-LABEL: bessel_i0e + func.func @bessel_i0e(%arg0: tensor<3xf16>, %arg1: tensor<3xf32>, %arg2: tensor<3xf64>) -> (tensor<3xf16>, tensor<3xf32>, tensor<3xf64>) { + // CHECK-NOT: tf.BesselI0e + %0 = "tf.BesselI0e"(%arg0) : (tensor<3xf16>) -> (tensor<3xf16>) + %1 = "tf.BesselI0e"(%arg1) : (tensor<3xf32>) -> (tensor<3xf32>) + %2 = "tf.BesselI0e"(%arg2) : (tensor<3xf64>) -> (tensor<3xf64>) + func.return %0, %1, %2 : tensor<3xf16>, tensor<3xf32>, tensor<3xf64> + } + + // CHECK-LABEL: bessel_i1e + func.func @bessel_i1e(%arg0: tensor<3xf16>, %arg1: tensor<3xf32>, %arg2: tensor<3xf64>) -> (tensor<3xf16>, tensor<3xf32>, tensor<3xf64>) { + // CHECK-NOT: tf.BesselI1e + %0 = "tf.BesselI1e"(%arg0) : (tensor<3xf16>) -> (tensor<3xf16>) + %1 = "tf.BesselI1e"(%arg1) : (tensor<3xf32>) -> (tensor<3xf32>) + %2 = "tf.BesselI1e"(%arg2) : (tensor<3xf64>) -> (tensor<3xf64>) + func.return %0, %1, %2 : tensor<3xf16>, tensor<3xf32>, tensor<3xf64> + } + + // CHECK-LABEL: diag + func.func @diag(%arg0: tensor<2xf32>) -> tensor<2x2xf32> { + // CHECK-NOT: tf.Diag + %0 = "tf.Diag"(%arg0) : (tensor<2xf32>) -> tensor<2x2xf32> + func.return %0 : tensor<2x2xf32> + } + + // CHECK-LABEL: random_uniform_int + func.func @random_uniform_int(%arg0: tensor, %arg1: tensor) -> tensor<1000xi32> { + %0 = "tf.Const"() {value = dense<1000> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NOT: tf.RandomUniformInt + %1 = 
"tf.RandomUniformInt"(%0, %arg0, %arg1) {seed = 0 : i64, seed2 = 0 : i64} : (tensor<1xi32>, tensor, tensor) -> tensor<1000xi32> + func.return %1 : tensor<1000xi32> + } + + // CHECK-LABEL: multinomial + func.func @multinomial(%arg0: tensor<2x4xf32>, %seed: tensor, %seed2: tensor) -> tensor<2x10xi32> { + // CHECK-NOT: tf.Multinomial + %samples = "tf.Const"() { value = dense<10> : tensor } : () -> tensor + %1 = "tf.Multinomial"(%arg0, %samples) {seed = 0, seed2 = 0}: (tensor<2x4xf32>, tensor) -> tensor<2x10xi32> + func.return %1 : tensor<2x10xi32> + } + + // CHECK-LABEL: @set_dynamic_dimension_size + func.func @set_dynamic_dimension_size(%input: tensor<4xf32>, %size: tensor) -> tensor { + %dimension = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + // CHECK: mhlo.set_dimension_size + // CHECK-SAME: {dimension = 0 : i64} : (tensor<4xf32>, tensor) -> tensor> + %0 = "tf.XlaSetDynamicDimensionSize"(%input, %dimension, %size) : (tensor<4xf32>, tensor, tensor) -> tensor + func.return %0 : tensor + } + + // CHECK-LABEL: unique + func.func @unique(%arg0: tensor<5xf32>) -> (tensor, tensor) { + // CHECK-NOT: tf.Unique + %0, %1 = "tf.Unique"(%arg0) : (tensor<5xf32>) -> (tensor, tensor) + func.return %0, %1 : tensor , tensor + } + + // CHECK-LABEL: @erfinv + func.func @erfinv(%input: tensor<4xf32>) -> tensor<4xf32> { + // CHECK-NOT: tf.Erfinv + %0 = "tf.Erfinv"(%input) : (tensor<4xf32>) -> tensor<4xf32> + func.return %0 : tensor<4xf32> + } + + // CHECK-LABEL: @ndtri + func.func @ndtri(%input: tensor<4xf32>) -> tensor<4xf32> { + // CHECK-NOT: tf.Ndtri + %0 = "tf.Ndtri"(%input) : (tensor<4xf32>) -> tensor<4xf32> + func.return %0 : tensor<4xf32> + } + + // CHECK-LABEL: @fake_param + func.func @fake_param() -> tensor<4xf32> { + // CHECK-NOT: tf.FakeParam + %0 = "tf.FakeParam"() {shape = #tf_type.shape<4>} : () -> tensor<4xf32> + func.return %0 : tensor<4xf32> + } + + // CHECK-LABEL: @parameterized_truncated_normal + func.func @parameterized_truncated_normal(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<10000000xf32> { + %0 = "tf.Const"() {value = dense<10000000> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NOT: tf.ParameterizedTruncatedNormal + %1 = "tf.ParameterizedTruncatedNormal"(%0, %arg0, %arg1, %arg2, %arg3) {seed = 0 : i64, seed2 = 0 : i64} : (tensor<1xi32>, tensor, tensor, tensor, tensor) -> tensor<10000000xf32> + func.return %1 : tensor<10000000xf32> + } + + // Check XlaSpmdFullToShardShape's conversion from split sharding to manual + // sharding. 
+ // The split sharding is: + // type: OTHER + // tile_assignment_dimensions: 2 + // tile_assignment_dimensions: 1 + // tile_assignment_devices: 0 + // tile_assignment_devices: 1 + // Serialized string: + // "\08\03\1A\02\02\01\22\02\00\01" + // The manual sharding is: + // type: MANUAL + // Serialized string: + // "\08\04" + + // CHECK-LABEL: @xla_spmd_full_to_shard_shape + func.func @xla_spmd_full_to_shard_shape(%arg0: tensor<2x2xi64>) -> (tensor<1x2xi64>) { + // CHECK: %[[SHARDING:.*]] = mhlo.custom_call @Sharding(%arg0) {backend_config = "", mhlo.sharding = "{devices=[2,1]0,1}"} : (tensor<2x2xi64>) -> tensor<2x2xi64> + // CHECK: %[[MANUAL:.*]] = mhlo.custom_call @SPMDFullToShardShape(%[[SHARDING]]) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<2x2xi64>) -> tensor<1x2xi64> + // CHECK: return %[[MANUAL]] + %0 = "tf.XlaSpmdFullToShardShape"(%arg0) {dim = -1 : i64, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<2x2xi64>) -> tensor<1x2xi64> + func.return %0 : tensor<1x2xi64> + } + + // Check XlaSpmdShardToFullShape's conversion from manual sharding to split + // sharding. + // The manual sharding is: + // type: MANUAL + // Serialized string: + // "\08\04" + // The split sharding is: + // type: OTHER + // tile_assignment_dimensions: 2 + // tile_assignment_dimensions: 1 + // tile_assignment_devices: 0 + // tile_assignment_devices: 1 + // Serialized string: + // "\08\03\1A\02\02\01\22\02\00\01" + + // CHECK-LABEL: @xla_spmd_shard_to_full_shape + func.func @xla_spmd_shard_to_full_shape(%arg0: tensor<1x2xi64>) -> (tensor<2x2xi64>) { + // CHECK: %[[SHARDING:.*]] = mhlo.custom_call @Sharding(%arg0) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<1x2xi64>) -> tensor<1x2xi64> + // CHECK: %[[FULL:.*]] = mhlo.custom_call @SPMDShardToFullShape(%[[SHARDING]]) {backend_config = "", mhlo.sharding = "{devices=[2,1]0,1}"} : (tensor<1x2xi64>) -> tensor<2x2xi64> + // CHECK: return %[[FULL]] + %0 = "tf.XlaSpmdShardToFullShape"(%arg0) {dim = -1 : i64, full_shape = #tf_type.shape<2x2>, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<1x2xi64>) -> tensor<2x2xi64> + func.return %0 : tensor<2x2xi64> + } + + // CHECK-LABEL: @xla_svd + func.func @xla_svd(%arg0: tensor<1x1xf32>) -> (tensor<1xf32>, tensor<1x1xf32>, tensor<1x1xf32>) { + // CHECK-NOT: XlaSvd + %s, %u, %v = "tf.XlaSvd"(%arg0) {max_iter = 1, epsilon = 1.0E-09 : f32, precision_config = ""} : (tensor<1x1xf32>) -> (tensor<1xf32>, tensor<1x1xf32>, tensor<1x1xf32>) + func.return %s, %u, %v : tensor<1xf32>, tensor<1x1xf32>, tensor<1x1xf32> + } + + func.func @identity(%arg0: f32) -> f32 { + func.return %arg0 : f32 + } + + // This test verifies that legalization for ops with a symbol reference attribute + // is not attempted even if they are in the allow-list. XLA op kernels for these + // ops compile the function to HLO on demand, which won't work in our case: the + // function may contain ops that are unsupported in the fallback, nor do we provide + // an XlaCompiler to the kernel. Using an allowed op, Atan2, to protect against + // future addition of a new op with a symbol ref.
+ + // CHECK-LABEL: @atan2_with_symbol_ref + func.func @atan2_with_symbol_ref(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: tf.Atan2 + // expected-remark@+1 {{ops with symbol references are not supported}} + %0 = "tf.Atan2"(%arg0, %arg0) {_body = @identity} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + + func.return %0 : tensor<2xf32> + } + + func.func private @branch0(tensor<2xf32>) -> tensor<2xf32> + func.func private @branch1(tensor<2xf32>) -> tensor<2xf32> + + func.func @case_with_symbol_ref(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: tf.Case + // expected-remark@+1 {{ops with symbol references are not supported}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + func.return %0 : tensor<2xf32> + } + + // CHECK-LABEL: const + func.func @const() -> tensor<2xf32> { + // CHECK: mhlo.const + %cst = "tf.Const"() {value = dense<2.0> : tensor<2xf32>} : () -> tensor<2xf32> + func.return %cst : tensor<2xf32> + } + + // CHECK-LABEL: @bounds_propagation + func.func @bounds_propagation(%input: tensor<4xf32>, %size: tensor) -> tensor { + %dimension = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + // CHECK: %[[BOUNDED:.*]] = "mhlo.set_dimension_size" + // CHECK-SAME: {dimension = 0 : i64} : (tensor<4xf32>, tensor) -> tensor> + %0 = "tf.XlaSetDynamicDimensionSize"(%input, %dimension, %size) : (tensor<4xf32>, tensor, tensor) -> tensor + + %axis = "tf.Const"() { value = dense<0> : tensor<1xi32> } : () -> tensor<1xi32> + // CHECK: %[[REVERSED:.*]] = "mhlo.reverse"(%[[BOUNDED]]) + // CHECK-SAME: {dimensions = dense<0> : tensor<1xi64>} + // CHECK-SAME: (tensor>) -> tensor> + %1 = "tf.ReverseV2"(%0, %axis) : (tensor, tensor<1xi32>) -> tensor + + // CHECK: %[[RESULT:.*]] = tensor.cast %[[REVERSED]] : tensor> to tensor + // CHECK: return %[[RESULT]] : tensor + func.return %1 : tensor + } + + // CHECK-LABEL: @bounds_propagation_skip_symbol_ref_ops + func.func @bounds_propagation_skip_symbol_ref_ops(%input: tensor<4xf32>, %size: tensor) -> tensor { + %dimension = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + // CHECK: %[[BOUNDED:.*]] = "mhlo.set_dimension_size" + // CHECK-SAME: {dimension = 0 : i64} : (tensor<4xf32>, tensor) -> tensor> + %0 = "tf.XlaSetDynamicDimensionSize"(%input, %dimension, %size) : (tensor<4xf32>, tensor, tensor) -> tensor + + // CHECK: %[[ORIGINAL:.*]] = tensor.cast %[[BOUNDED]] : tensor> to tensor + + %axis = "tf.Const"() { value = dense<0> : tensor<1xi32> } : () -> tensor<1xi32> + // CHECK: tf.ReverseV2 + // CHECK-SAME: (tensor, tensor<1xi32>) -> tensor + // expected-remark@+1 {{lowering requires bounded tensor operands}} + %1 = "tf.ReverseV2"(%0, %axis) {_body = @identity} : (tensor, tensor<1xi32>) -> tensor + + func.return %1 : tensor + } + + // CHECK-LABEL: func @set_bound + func.func @set_bound(%arg0: tensor) -> tensor { + %bound = "tf.Const"() {value = dense<16> : tensor} : () -> tensor + + // CHECK: %[[RESULT:.*]] = mhlo.custom_call @SetBound(%arg0) {backend_config = "", mhlo.literal = dense<16> : tensor} + %bounded = "tf.XlaSetBound"(%arg0, %bound) : (tensor, tensor) -> tensor + + // CHECK: return %[[RESULT]] + func.return %bounded : tensor + } + + // CHECK-LABEL: func @greater + func.func @greater(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: mhlo.compare GT, %arg0, %arg1 + %0 = "tf.Greater"(%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + func.return %0: tensor<2xi1> + } + + // 
CHECK-LABEL: batchmatmulv2 + func.func @batchmatmulv2(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { + // CHECK: mhlo.reduce + // CHECK: mhlo.dot_general + // CHECK: mhlo.transpose + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<1x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + func.return %0 : tensor<3x4x4xf32> + } + + // CHECK-LABEL: approx_topk + func.func @approx_topk(%arg0: tensor>> {tf._user_specified_name = "db"}) -> (tensor<10x10xbf16>) { + %0 = "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor>>) -> tensor<10x500xbf16> + // CHECK: mhlo.compare GT + %values, %indices = "tf.ApproxTopK"(%0) {aggregate_to_topk = true, device = "", is_max_k = true, k = 10 : i64, recall_target = 0.949999988 : f32, reduction_dimension = -1 : i64, reduction_input_size_override = -1 : i64} : (tensor<10x500xbf16>) -> (tensor<10x10xbf16>, tensor<10x10xi32>) + return %values : tensor<10x10xbf16> + } +} diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla.mlir index cc2e7f24709..3e550e0366c 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt "-xla-legalize-tf=device-type=XLA_CPU_JIT allow-partial-conversion=true prefer-tf2xla=true use-tf2xla-fallback=true" %s -verify-diagnostics | FileCheck %s +// RUN: tf-opt "-xla-legalize-tf=device-type=XLA_CPU_JIT allow-partial-conversion=true prefer-tf2xla=true use-tf2xla-fallback=true use-tf2xla-hlo-importer=false" %s -verify-diagnostics | FileCheck %s module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { @@ -337,6 +337,54 @@ func.func @parameterized_truncated_normal(%arg0: tensor, %arg1: tensor func.return %1 : tensor<10000000xf32> } +// Check XlaSpmdFullToShardShape's conversion from split sharding to manual +// sharding. +// The split sharding is: +// type: OTHER +// tile_assignment_dimensions: 2 +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// tile_assignment_devices: 1 +// Serialized string: +// "\08\03\1A\02\02\01\22\02\00\01" +// The manual sharding is: +// type: MANUAL +// Serialized string: +// "\08\04" + +// CHECK-LABEL: @xla_spmd_full_to_shard_shape +func.func @xla_spmd_full_to_shard_shape(%arg0: tensor<2x2xi64>) -> (tensor<1x2xi64>) { + // CHECK: %[[SHARDING:.*]] = mhlo.custom_call @Sharding(%arg0) {backend_config = "", mhlo.sharding = "{devices=[2,1]0,1}"} : (tensor<2x2xi64>) -> tensor<2x2xi64> + // CHECK: %[[MANUAL:.*]] = mhlo.custom_call @SPMDFullToShardShape(%[[SHARDING]]) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<2x2xi64>) -> tensor<1x2xi64> + // CHECK: return %[[MANUAL]] + %0 = "tf.XlaSpmdFullToShardShape"(%arg0) {dim = -1 : i64, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<2x2xi64>) -> tensor<1x2xi64> + func.return %0 : tensor<1x2xi64> +} + +// Check XlaSpmdShardToFullShape's conversion from manual sharding to split +// sharding. 
+// The manual sharding is: +// type: MANUAL +// Serialized string: +// "\08\04" +// The split sharding is: +// type: OTHER +// tile_assignment_dimensions: 2 +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// tile_assignment_devices: 1 +// Serialized string: +// "\08\03\1A\02\02\01\22\02\00\01" + +// CHECK-LABEL: @xla_spmd_shard_to_full_shape +func.func @xla_spmd_shard_to_full_shape(%arg0: tensor<1x2xi64>) -> (tensor<2x2xi64>) { + // CHECK: %[[SHARDING:.*]] = mhlo.custom_call @Sharding(%arg0) {backend_config = "", mhlo.sharding = "{manual}"} : (tensor<1x2xi64>) -> tensor<1x2xi64> + // CHECK: %[[FULL:.*]] = mhlo.custom_call @SPMDShardToFullShape(%[[SHARDING]]) {backend_config = "", mhlo.sharding = "{devices=[2,1]0,1}"} : (tensor<1x2xi64>) -> tensor<2x2xi64> + // CHECK: return %[[FULL]] + %0 = "tf.XlaSpmdShardToFullShape"(%arg0) {dim = -1 : i64, full_shape = #tf_type.shape<2x2>, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<1x2xi64>) -> tensor<2x2xi64> + func.return %0 : tensor<2x2xi64> +} + // CHECK-LABEL: @xla_svd func.func @xla_svd(%arg0: tensor<1x1xf32>) -> (tensor<1xf32>, tensor<1x1xf32>, tensor<1x1xf32>) { // CHECK-NOT: XlaSvd @@ -428,6 +476,21 @@ func.func @set_bound(%arg0: tensor) -> tensor { func.return %bounded : tensor } +// CHECK-LABEL: @XlaScatterOpNotSupported +func.func @XlaScatterOpNotSupported(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>, %arg2: tensor<4xi32>) -> tensor<8xi32> { + // CHECK: tf.XlaScatter + %0 = "tf.XlaScatter"(%arg0, %arg1, %arg2) {dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", indices_are_sorted = false, update_computation = @no_reducer} : (tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<8xi32> + func.return %0 : tensor<8xi32> +} + +// CHECK-LABEL: approx_topk +func.func @approx_topk(%arg0: tensor>> {tf._user_specified_name = "db"}) -> (tensor<10x10xbf16>) { + %0 = "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor>>) -> tensor<10x500xbf16> + // CHECK: mhlo.compare GT + %values, %indices = "tf.ApproxTopK"(%0) {aggregate_to_topk = true, device = "", is_max_k = true, k = 10 : i64, recall_target = 0.949999988 : f32, reduction_dimension = -1 : i64, reduction_input_size_override = -1 : i64} : (tensor<10x500xbf16>) -> (tensor<10x10xbf16>, tensor<10x10xi32>) + return %values : tensor<10x10xbf16> +} + // TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is // available but doesn't support this instance. 
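Editor's note on the serialized sharding strings quoted in the XlaSpmdFullToShardShape / XlaSpmdShardToFullShape tests above: "\08\03" encodes proto field 1 (type) set to OTHER, "\08\04" encodes type MANUAL, "\1A\02\02\01" is the packed tile_assignment_dimensions [2, 1], and "\22\02\00\01" is the packed tile_assignment_devices [0, 1], matching the text-format dumps in the test comments. A minimal C++ sketch of how such strings could be produced from the xla::OpSharding proto follows; it is illustrative only, not part of this patch, and the include path is assumed from the //tensorflow/compiler/xla:xla_data_proto_cc dependency used elsewhere in this diff.

#include <string>

#include "tensorflow/compiler/xla/xla_data.pb.h"

// Split sharding: a 2x1 tile assignment over devices 0 and 1.
// Expected to serialize to "\08\03\1A\02\02\01\22\02\00\01".
std::string MakeSplitSharding() {
  xla::OpSharding sharding;
  sharding.set_type(xla::OpSharding::OTHER);
  sharding.add_tile_assignment_dimensions(2);
  sharding.add_tile_assignment_dimensions(1);
  sharding.add_tile_assignment_devices(0);
  sharding.add_tile_assignment_devices(1);
  return sharding.SerializeAsString();
}

// Manual sharding. Expected to serialize to "\08\04".
std::string MakeManualSharding() {
  xla::OpSharding sharding;
  sharding.set_type(xla::OpSharding::MANUAL);
  return sharding.SerializeAsString();
}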
} diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir index 78f254d7a23..19fe43f0250 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir @@ -4059,6 +4059,34 @@ func.func @rng_uniform(%arg0: tensor<3xi32>) -> tensor<12x?x64xf32> { // ----- +// CHECK-LABEL: func @random_uniform_simple +func.func @random_uniform_simple(%arg0: tensor<3xi32>) -> tensor<12x?x64xf32> { + // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK-DAG: %[[ONE:.*]] = mhlo.constant dense<1.000000e+00> : tensor + // CHECK: %[[CONV:.*]] = mhlo.convert %arg0 : (tensor<3xi32>) -> tensor<3xi64> + // CHECK: %[[F32:.*]] = "mhlo.rng"(%[[ZERO]], %[[ONE]], %[[CONV]]) {{.*UNIFORM.*}} -> tensor<12x?x64xf32> + %0 = "tf.RandomUniform"(%arg0) : (tensor<3xi32>) -> tensor<12x?x64xf32> + // CHECK: return %[[F32]] + func.return %0 : tensor<12x?x64xf32> +} + +// ----- + +// CHECK-LABEL: func @random_uniform_with_seeds +func.func @random_uniform_with_seeds(%arg0: tensor<4xi32>) -> tensor<32x12x12x64xf32> { + // CHECK: %0 = mhlo.constant dense<[32, 12, 12, 64]> : tensor<4xi32> + // CHECK-NEXT: %1 = mhlo.constant dense<0.000000e+00> : tensor + // CHECK-NEXT: %2 = mhlo.constant dense<1.000000e+00> : tensor + // CHECK-NEXT: %3 = mhlo.convert %0 : (tensor<4xi32>) -> tensor<4xi64> + // CHECK-NEXT: %4 = "mhlo.rng"(%1, %2, %3) {rng_distribution = #mhlo.rng_distribution} : (tensor, tensor, tensor<4xi64>) -> tensor<32x12x12x64xf32> + %cst = "tf.Const"() {value = dense<[32, 12, 12, 64]> : tensor<4xi32>} : () -> tensor<4xi32> + %0 = "tf.RandomUniform"(%cst) {seed = 87654321 : i64, seed2 = 0 : i64} : (tensor<4xi32>) -> tensor<32x12x12x64xf32> + // CHECK: return %4 : tensor<32x12x12x64xf32> + func.return %0 : tensor<32x12x12x64xf32> +} + +// ----- + // CHECK-LABEL: func @rng_std_normal func.func @rng_std_normal(%arg0: tensor<3xi32>) -> tensor<12x?x64xf32> { // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor @@ -6264,6 +6292,208 @@ func.func @uniform_quantized_convolution(%input: tensor<1x2x2x3xf32>) -> () { func.return } +//===----------------------------------------------------------------------===// +// tf.UniformQuantizedAdd legalization +//===----------------------------------------------------------------------===// + +// ----- + +// CHECK-LABEL: func @uniform_quantized_add +func.func @uniform_quantized_add(%input: tensor<3x2xf32>) -> () { + %input_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor + %input_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor + // tensor_proto that points to dense<127> of type !tf_type.qint32. 
+ %bias = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %bias_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor + %bias_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor + + %output_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor + %output_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor + + // CHECK-DAG: %[[LHS:.*]] = mhlo.uniform_quantize %arg0 : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + // CHECK-DAG: %[[RHS:.*]] = mhlo.constant() + // CHECK-SAME{LITERAL}: {value = dense<127> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> + // CHECK: chlo.broadcast_add %[[LHS]], %[[RHS]] {broadcast_dimensions = dense<1> : tensor<1xi64>} : + // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) + // CHECK-SAME: -> tensor<3x2x!quant.uniform> + + %0 = "tf.UniformQuantize"(%input, %input_scales, %input_zps) { + quantization_axis = -1 : i64, quantization_min_val = -2147483648 : i64, quantization_max_val = 2147483647 : i64 + } : (tensor<3x2xf32>, tensor, tensor) -> tensor<3x2x!tf_type.qint32> + %1 = "tf.UniformQuantizedAdd"( + %0, %bias, + %input_scales, %input_zps, + %bias_scales, %bias_zps, + %output_scales, %output_zps) { + lhs_quantization_axis = -1 : i64, + lhs_quantization_min_val = -2147483648 : i64, + lhs_quantization_max_val = 2147483647 : i64, + rhs_quantization_axis = -1 : i64, + rhs_quantization_min_val = -2147483648 : i64, + rhs_quantization_max_val = 2147483647 : i64, + output_quantization_axis = -1 : i64, + output_quantization_min_val = -2147483648 : i64, + output_quantization_max_val = 2147483647 : i64} : ( + tensor<3x2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, + tensor, tensor, + tensor, tensor, + tensor, tensor) -> tensor<3x2x!tf_type.qint32> + func.return +} + +// ----- + +// CHECK-LABEL: func @uniform_quantized_add_unknown_lhs_rank +func.func @uniform_quantized_add_unknown_lhs_rank(%input: tensor<*x!tf_type.qint32>) -> () { + %input_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor + %input_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor + // tensor_proto that points to dense<127> of type !tf_type.qint32. 
+ %bias = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %bias_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor + %bias_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor + + %output_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor + %output_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor + %1 = "tf.UniformQuantizedAdd"( + %input, %bias, + %input_scales, %input_zps, + %bias_scales, %bias_zps, + %output_scales, %output_zps) { + lhs_quantization_axis = -1 : i64, + lhs_quantization_min_val = -2147483648 : i64, + lhs_quantization_max_val = 2147483647 : i64, + rhs_quantization_axis = -1 : i64, + rhs_quantization_min_val = -2147483648 : i64, + rhs_quantization_max_val = 2147483647 : i64, + output_quantization_axis = -1 : i64, + output_quantization_min_val = -2147483648 : i64, + output_quantization_max_val = 2147483647 : i64} : ( + tensor<*x!tf_type.qint32>, tensor<2x!tf_type.qint32>, + tensor, tensor, + tensor, tensor, + tensor, tensor) -> tensor<*x!tf_type.qint32> + func.return +} + +// ----- + +// CHECK-LABEL: func @uniform_quantized_add_non_constant_lhs_scales +func.func @uniform_quantized_add_non_constant_lhs_scales( + %input: tensor<*x!tf_type.qint32>, %input_scales: tensor) -> () { + %input_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor + // tensor_proto that points to dense<127> of type !tf_type.qint32. + %bias = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %bias_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor + %bias_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor + + %output_scales = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor + %output_zps = "tf.Const"() { value = dense<4> : tensor } : () -> tensor + %1 = "tf.UniformQuantizedAdd"( + %input, %bias, + %input_scales, %input_zps, + %bias_scales, %bias_zps, + %output_scales, %output_zps) { + lhs_quantization_axis = -1 : i64, + lhs_quantization_min_val = -2147483648 : i64, + lhs_quantization_max_val = 2147483647 : i64, + rhs_quantization_axis = -1 : i64, + rhs_quantization_min_val = -2147483648 : i64, + rhs_quantization_max_val = 2147483647 : i64, + output_quantization_axis = -1 : i64, + output_quantization_min_val = -2147483648 : i64, + output_quantization_max_val = 2147483647 : i64} : ( + tensor<*x!tf_type.qint32>, tensor<2x!tf_type.qint32>, + tensor, tensor, + tensor, tensor, + tensor, tensor) -> tensor<*x!tf_type.qint32> + func.return +} + +//===----------------------------------------------------------------------===// +// tf.UniformQuantizedClipByValue legalization +//===----------------------------------------------------------------------===// + +// ----- + +// CHECK-LABEL: func @uniform_quantized_clip_by_value +func.func @uniform_quantized_clip_by_value(%input: tensor<3x2xf32>) -> () { + %scales = "tf.Const"() { value = dense<2.0> : tensor<2xf32> } : () -> tensor<2xf32> + %zps = "tf.Const"() { value = dense<4> : tensor<2xi32> } : () -> tensor<2xi32> + // tensor_proto that points to dense<127> of type !tf_type.qint32. 
+ %min = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %max = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + + // CHECK-DAG: %[[OPERAND:.*]] = mhlo.uniform_quantize %arg0 : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + // CHECK-DAG: %[[MIN:.*]] = mhlo.constant() + // CHECK-SAME{LITERAL}: {value = dense<127> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> + // CHECK: %[[MAX:.*]] = mhlo.constant() + // CHECK-SAME{LITERAL}: {value = dense<127> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> + // CHECK: %[[MIN_CLIPPED:.*]] = chlo.broadcast_maximum %[[OPERAND]], %[[MIN]] {broadcast_dimensions = dense<1> : tensor<1xi64>} : + // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) + // CHECK-SAME: -> tensor<3x2x!quant.uniform> + // CHECK: chlo.broadcast_minimum %[[MIN_CLIPPED]], %[[MAX]] {broadcast_dimensions = dense<1> : tensor<1xi64>} : + // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) + // CHECK-SAME: -> tensor<3x2x!quant.uniform> + + %0 = "tf.UniformQuantize"(%input, %scales, %zps) { + quantization_axis = 1 : i64, quantization_min_val = -2147483648 : i64, quantization_max_val = 2147483647 : i64 + } : (tensor<3x2xf32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> + %1 = "tf.UniformQuantizedClipByValue"(%0, %min, %max, %scales, %zps) { + quantization_axis = 1 : i64, + quantization_min_val = -2147483648 : i64, + quantization_max_val = 2147483647 : i64 + } : (tensor<3x2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> + func.return +} + +// ----- + +// CHECK-LABEL: func @uniform_quantized_clip_by_value_min_not_const +func.func @uniform_quantized_clip_by_value_min_not_const(%input: tensor<3x2x!tf_type.qint32>, %min: tensor<2x!tf_type.qint32>) -> () { + %scales = "tf.Const"() { value = dense<2.0> : tensor<2xf32> } : () -> tensor<2xf32> + %zps = "tf.Const"() { value = dense<4> : tensor<2xi32> } : () -> tensor<2xi32> + // tensor_proto that points to dense<127> of type !tf_type.qint32. + %max = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %0 = "tf.UniformQuantizedClipByValue"(%input, %min, %max, %scales, %zps) { + quantization_axis = 1 : i64, + quantization_min_val = -2147483648 : i64, + quantization_max_val = 2147483647 : i64 + } : (tensor<3x2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> + func.return +} + +// ----- + +// CHECK-LABEL: func @uniform_quantized_clip_by_value_max_not_const +func.func @uniform_quantized_clip_by_value_max_not_const(%input: tensor<3x2x!tf_type.qint32>, %max: tensor<2x!tf_type.qint32>) -> () { + %scales = "tf.Const"() { value = dense<2.0> : tensor<2xf32> } : () -> tensor<2xf32> + %zps = "tf.Const"() { value = dense<4> : tensor<2xi32> } : () -> tensor<2xi32> + // tensor_proto that points to dense<127> of type !tf_type.qint32. 
+ %min = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %0 = "tf.UniformQuantizedClipByValue"(%input, %min, %max, %scales, %zps) { + quantization_axis = 1 : i64, + quantization_min_val = -2147483648 : i64, + quantization_max_val = 2147483647 : i64 + } : (tensor<3x2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> + func.return +} + +// ----- + +// CHECK-LABEL: func @uniform_quantized_clip_by_value_scales_not_const +func.func @uniform_quantized_clip_by_value_scales_not_const(%input: tensor<3x2x!tf_type.qint32>, %scales: tensor<2xf32>) -> () { + %zps = "tf.Const"() { value = dense<4> : tensor<2xi32> } : () -> tensor<2xi32> + // tensor_proto that points to dense<127> of type !tf_type.qint32. + %min = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %max = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %0 = "tf.UniformQuantizedClipByValue"(%input, %min, %max, %scales, %zps) { + quantization_axis = 1 : i64, + quantization_min_val = -2147483648 : i64, + quantization_max_val = 2147483647 : i64 + } : (tensor<3x2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> + func.return +} + //===----------------------------------------------------------------------===// // tf.Softplus legalization //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD new file mode 100644 index 00000000000..df4bf8fa204 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD @@ -0,0 +1,493 @@ +# Description: +# TF2XLA Bridge transforms + +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_cloud") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = ["//visibility:public"], + licenses = ["notice"], +) + +gentbl_cc_library( + name = "legalize_tf_patterns_inc_gen", + compatible_with = get_compatible_with_cloud(), + tbl_outs = [ + ( + ["-gen-rewriters"], + "generated_legalize_tf.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "legalize_tf_patterns.td", + deps = [ + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", + "//tensorflow/compiler/xla/mlir_hlo:hlo_ops_td_files", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncTdFiles", + "@llvm-project//mlir:TensorOpsTdFiles", + ], +) + +gentbl_cc_library( + name = "xla_legalize_tf_passes_inc_gen", + compatible_with = get_compatible_with_cloud(), + tbl_outs = [ + ( + [ + "-gen-pass-decls", + "-name=LegalizeTf", + ], + "xla_legalize_tf_passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "xla_legalize_tf_passes.td", + deps = [ + "@llvm-project//mlir:PassBaseTdFiles", + ], +) + +gentbl_cc_library( + name = "tf_xla_passes_inc_gen", + compatible_with = get_compatible_with_cloud(), + tbl_outs = [ + ( + [ + "-gen-pass-decls", + "-name=TfXla", + ], + "tf_xla_passes.h.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "tf_xla_passes.td", + deps = [ + 
"//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", + "//tensorflow/compiler/xla/mlir_hlo:hlo_ops_td_files", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncTdFiles", + "@llvm-project//mlir:PassBaseTdFiles", + "@llvm-project//mlir:SparseTensorDialect", + "@llvm-project//mlir:TensorOpsTdFiles", + ], +) + +cc_library( + name = "tf_xla_passes", + srcs = [ + "xla_legalize_tf_passes.h.inc", + ], + hdrs = [ + "passes.h", + ], + deps = [ + ":tf_xla_passes_inc_gen", + ":xla_legalize_tf", + "//tensorflow/compiler/xla/mlir_hlo", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MemRefDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SparseTensorDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) + +cc_library( + name = "legalize_utils", + srcs = ["utils.cc"], + hdrs = ["utils.h"], + deps = [ + "//tensorflow/compiler/xla/mlir_hlo", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + ], +) + +cc_library( + name = "test_utils", + testonly = True, + srcs = ["test_utils.cc"], + hdrs = ["test_utils.h"], + deps = [ + "//tensorflow/compiler/mlir:register_common_dialects", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", + "//tensorflow/core/lib/monitoring:cell_reader", + "//tensorflow/core/platform:errors", + "//tensorflow/tsl/lib/core:status_test_util", + "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:Pass", + ], +) + +cc_library( + name = "legalize_tf", + srcs = [ + "generated_legalize_tf.inc", + "legalize_tf.cc", + ], + hdrs = [ + "passes.h", + ], + deps = [ + ":legalize_tf_patterns_inc_gen", + ":legalize_utils", + ":tf_xla_passes_inc_gen", + ":xla_legalize_tf_passes_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client:sharding_builder", + "//tensorflow/compiler/xla/client/lib:conv_grad_size_util", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/compiler/xla/mlir_hlo:convert_op_folder", + "//tensorflow/compiler/xla/translate/hlo_to_mhlo:attribute_importer", + "//tensorflow/core:framework", + "//tensorflow/core/kernels:conv_grad_shape_utils", + "//tensorflow/tsl/platform:bfloat16", + "//tensorflow/tsl/platform:status", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MemRefDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:TransformUtils", + "@stablehlo//:chlo_ops", + ], +) + +cc_library( + name = "xla_legalize_targets", + srcs = [ + "xla_legalize_targets.cc", + ], + hdrs = [ + "xla_legalize_targets.h", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/xla/mlir_hlo", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:TensorDialect", + 
"@llvm-project//mlir:TransformUtils", + "@stablehlo//:chlo_ops", + ], +) + +tf_cc_test( + name = "xla_legalize_targets_test", + srcs = ["xla_legalize_targets_test.cc"], + deps = [ + ":xla_legalize_targets", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:TransformUtils", + "@stablehlo//:chlo_ops", + ], +) + +tf_cc_test( + name = "verify_tfxla_legalization_test", + srcs = ["verify_tfxla_legalization_test.cc"], + deps = [ + ":legalize_tf", + ":test_utils", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", + "//tensorflow/core/lib/monitoring:cell_reader", + "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:Pass", + ], +) + +cc_library( + name = "xla_legalize_tf", + srcs = [ + "convert_mhlo_quant_to_int.cc", + "infeed_ops_xla_adjust_layout.cc", + "legalize_tf_collective.cc", + "legalize_tf_communication.cc", + "legalize_tf_types.cc", + "tf_xla_passes.h.inc", + "tfxla_device_specific_transforms.cc", + "verify_tfxla_legalization.cc", + "xla_legalize_tf.cc", + "xla_legalize_tf_passes.h.inc", + ], + hdrs = [ + "passes.h", + ], + deps = [ + ":legalize_tf", + ":legalize_utils", + ":xla_legalize_targets", + ":xla_legalize_tf_no_fallback", + ":xla_legalize_tf_passes_inc_gen", + ":xla_legalize_tf_with_tf2xla", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "//tensorflow/compiler/mlir/tensorflow:mangling_util", + "//tensorflow/compiler/mlir/tensorflow:set_tpu_infeed_layout", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/tf2xla/kernels:rng_converter_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:side_effect_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client:sharding_builder", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/compiler/xla/mlir_hlo:chlo_legalize_to_hlo", + "//tensorflow/compiler/xla/mlir_hlo:convert_op_folder", + "//tensorflow/compiler/xla/stream_executor/tpu:c_api_conversions", + "//tensorflow/compiler/xla/stream_executor/tpu:tpu_api", + "//tensorflow/compiler/xla/translate/hlo_to_mhlo:attribute_importer", + "//tensorflow/compiler/xla/translate/mhlo_to_hlo:type_to_shape", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/util/quantization:uniform_quant_ops_params", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MemRefDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:SparseTensorDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + "@stablehlo//:chlo_ops", + ], +) + +cc_library( + name = "xla_legalize_tf_no_fallback", + srcs = [ + "xla_legalize_tf_no_fallback.cc", + "xla_legalize_tf_passes.h.inc", + ], + hdrs = [ + 
"passes.h", + ], + deps = [ + ":legalize_tf", + ":tf_xla_passes_inc_gen", + ":xla_legalize_tf_passes_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "//tensorflow/compiler/xla/mlir_hlo", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MemRefDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:SparseTensorDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:Transforms", + "@stablehlo//:chlo_ops", + ], +) + +cc_library( + name = "tf2xla_rewriter", + srcs = [ + "tf2xla_rewriter.cc", + ], + hdrs = [ + "tf2xla_rewriter.h", + ], + visibility = ["//visibility:private"], + deps = [ + ":legalize_tf", + "//tensorflow/compiler/mlir:op_or_arg_name_mapper", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_tensor", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", + "//tensorflow/compiler/mlir/tensorflow:tpu_embedding_ops_registry", + "//tensorflow/compiler/mlir/tensorflow:translate_utils", + "//tensorflow/compiler/tf2xla:xla_compilation_device", + "//tensorflow/compiler/tf2xla:xla_context", + "//tensorflow/compiler/tf2xla:xla_expression", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/tf2xla:xla_op_registry", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/hlo/ir:hlo", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/compiler/xla/service:hlo_proto_cc", + "//tensorflow/compiler/xla/translate/hlo_to_mhlo:hlo_module_importer", + "//tensorflow/compiler/xla/translate/hlo_to_mhlo:hlo_to_mlir_hlo", + "//tensorflow/compiler/xla/translate/hlo_to_mhlo:mlir_hlo_builder", + "//tensorflow/compiler/xla/translate/mhlo_to_hlo:type_to_shape", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:framework_types_hdr", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/framework:allocator", + "//tensorflow/core/protobuf:for_core_protos_cc", + "//tensorflow/tsl/platform:env", + "//tensorflow/tsl/platform:errors", + "//tensorflow/tsl/platform:status", + "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SparseTensorDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:TransformUtils", + ], +) + +tf_cc_test( + name = "tf2xla_rewriter_test", + srcs = [ + "tf2xla_rewriter_test.cc", + ], + deps = [ + ":test_utils", + ":tf2xla_rewriter", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/tf2xla:xla_op_registry", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/core:framework", + "//tensorflow/core:ops", + 
"//tensorflow/tsl/lib/core:status_test_util", + "//tensorflow/tsl/platform:errors", + "//tensorflow/tsl/platform:status", + "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/memory", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) + +cc_library( + name = "xla_legalize_tf_with_tf2xla", + srcs = [ + "legalize_tf_with_tf2xla.cc", + ], + hdrs = [ + "passes.h", + ], + deps = [ + ":tf2xla_rewriter", + ":tf_xla_passes_inc_gen", + ":xla_legalize_tf_passes_inc_gen", + "//tensorflow/compiler/mlir:op_or_arg_name_mapper", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_tensor", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow:tpu_embedding_ops_registry", + "//tensorflow/compiler/mlir/tensorflow:translate_utils", + "//tensorflow/compiler/tf2xla:xla_compilation_device", + "//tensorflow/compiler/tf2xla:xla_context", + "//tensorflow/compiler/tf2xla:xla_expression", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/tf2xla:xla_op_registry", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/compiler/xla/stream_executor:timer", + "//tensorflow/compiler/xla/translate/hlo_to_mhlo:mlir_hlo_builder", + "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:session_options", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SparseTensorDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:TransformUtils", + ], +) + +tf_cc_test( + name = "xla_legalize_tf_test", + srcs = [ + "xla_legalize_tf_test.cc", + ], + deps = [ + ":tf_xla_passes", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", + "//tensorflow/compiler/tf2xla:xla_compilation_device", + "//tensorflow/compiler/tf2xla:xla_context", + "//tensorflow/compiler/tf2xla:xla_expression", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/tf2xla:xla_op_registry", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core/framework:allocator", + "//tensorflow/core/lib/monitoring:cell_reader", + "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:TransformUtils", + ], +) diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc index 39742383ebf..06d6df007f2 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc @@ -137,7 +137,7 @@ static IntegerAttr GetHLOAxisFromTFAxis(Attribute attr, int64_t rank, Builder *b) { IntegerAttr intAttr = attr.dyn_cast_or_null(); 
if (auto elementAttr = attr.dyn_cast_or_null()) { - SmallVector index(elementAttr.getType().getRank(), 0); + SmallVector index(elementAttr.getShapedType().getRank(), 0); intAttr = elementAttr.getValues()[index]; } @@ -259,7 +259,7 @@ static RankedTensorType GetStaticBroadcastType( shape_large.end()); // Update according to the broadcast dimensions. - for (auto &index_pair : llvm::enumerate(broadcast_dimensions)) { + for (const auto &index_pair : llvm::enumerate(broadcast_dimensions)) { auto old_value = out_shape[index_pair.value()]; auto new_value = shape_small[index_pair.index()]; out_shape[index_pair.value()] = std::max(old_value, new_value); @@ -554,7 +554,7 @@ static DenseIntElementsAttr SliceDenseIntElementsAttrColumn2D( llvm::SmallVector values; values.reserve(shaped_type.getNumElements() / shape[1]); - for (auto &it : llvm::enumerate(int_attr.getValues())) { + for (const auto &it : llvm::enumerate(int_attr.getValues())) { if (static_cast(it.index() % shape[1]) == column) { values.push_back(it.value().getSExtValue()); } @@ -568,7 +568,7 @@ static DenseIntElementsAttr SliceDenseIntElementsAttrColumn2D( // Returns interior padding to use in HLO Pad op based on the TensorFlow padding // in TensorFlow PadV2 op. static DenseIntElementsAttr GetInteriorPadding(ElementsAttr tf_padding) { - auto length = tf_padding.getType().getShape()[0]; + auto length = tf_padding.getShapedType().getShape()[0]; auto element_type = IntegerType::get(tf_padding.getContext(), 64); return DenseIntElementsAttr::get( tensorflow::GetTypeFromTFTensorShape({length}, element_type), 0); @@ -3403,7 +3403,7 @@ class ConvertSplitVOp : public OpRewritePattern { std::optional dynamic_dim_index; split_sizes.reserve( split_sizes_attr.getType().cast().getNumElements()); - for (auto &dim : llvm::enumerate(split_sizes_attr)) { + for (const auto &dim : llvm::enumerate(split_sizes_attr)) { int64_t dim_val = dim.value().getSExtValue(); split_sizes.push_back(dim_val); if (dim_val == -1) { @@ -4072,7 +4072,8 @@ class GenericConvertReductionOp : public OpRewritePattern { // that this is a restricted form of shape manipulation that is just adding // unit dims. 
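A minimal sketch of the two idioms these legalize_tf.cc hunks adopt, namely querying an ElementsAttr's rank/shape through getShapedType() instead of getType() and binding llvm::enumerate results by const reference; the attribute below is made up purely for illustration and is not part of the patch:

#include <cstdint>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

void ShapedTypeIdioms() {
  mlir::MLIRContext context;
  mlir::Builder builder(&context);
  // Throwaway 2x2 i64 attribute used only to demonstrate the accessors.
  auto dense = mlir::DenseIntElementsAttr::get(
      mlir::RankedTensorType::get({2, 2}, builder.getI64Type()),
      llvm::ArrayRef<int64_t>({1, 2, 3, 4}));
  // Rank/shape queries now go through the ElementsAttr interface's
  // getShapedType() rather than the removed typed getType().
  int64_t rank = dense.cast<mlir::ElementsAttr>().getShapedType().getRank();
  (void)rank;  // rank == 2
  // llvm::enumerate yields proxy objects, so bind them by const reference.
  for (const auto &it : llvm::enumerate(dense.getValues<int64_t>())) {
    (void)it.index();
    (void)it.value();
  }
}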
if (op.getKeepDims()) { - for (auto &dim_is_reduced : llvm::enumerate(reduced_dimensions_bitmap)) { + for (const auto &dim_is_reduced : + llvm::enumerate(reduced_dimensions_bitmap)) { if (dim_is_reduced.value()) { auto index_attr = GetI32ElementsAttr( {static_cast(dim_is_reduced.index())}, &rewriter); @@ -5318,7 +5319,7 @@ class ConvertInfeedDequeueTupleOp } llvm::SmallVector results; results.reserve(result_types.size()); - for (auto &idx_and_type : llvm::enumerate(result_types)) { + for (const auto &idx_and_type : llvm::enumerate(result_types)) { results.push_back(data_and_token.getResult(idx_and_type.index())); } rewriter.replaceOp(op, ValueRange(results)); @@ -6772,7 +6773,7 @@ class LowerControlFlowOp : public OpConversionPattern { if constexpr (std::is_same::value) { TypeConverter::SignatureConversion signature(num_results); Block &block = region.front(); - for (auto &[block_idx, original_ty] : + for (const auto &[block_idx, original_ty] : llvm::enumerate(block.getArgumentTypes())) { TensorType updated_ty = UpdateElementTypeTo(original_ty, element_types[block_idx]); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc index 2e51ccd7901..94bb9deb14a 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc @@ -212,13 +212,15 @@ LogicalResult GetFunctionsToRewrite( return success(); } -// Assigns op sharding to full tensor on `kShardingTpuCore`. -void SetOpSharding(Operation* op) { - std::string sharding_serialized = - ::xla::sharding_builder::AssignDevice(kShardingTpuCore) - .SerializeAsString(); +// Assigns either MAXIMAL or MANUAL sharding. The MAXIMAL sharding sends/recvs +// one message from core `kShardingTpuCore` with the full tensor. MANUAL +// sharding sends/recvs one message for each core with the core's shard. +void SetOpSharding(Operation* op, bool manual_sharding) { + xla::OpSharding sharding = + manual_sharding ? ::xla::sharding_builder::Manual() + : ::xla::sharding_builder::AssignDevice(kShardingTpuCore); op->setAttr(kShardingAttr, - StringAttr::get(op->getContext(), sharding_serialized)); + StringAttr::get(op->getContext(), sharding.SerializeAsString())); } // Assigns frontend attributes holding information about data type and @@ -263,7 +265,7 @@ void SetFrontendAttributes(Operation* op, int32_t index, StringRef key, // Creates a `mhlo.send` op for sending value `operand`. Value CreateSendOp(OpBuilder& builder, int64_t& channel_id, Location loc, Value operand, StringRef key, size_t index, Value token, - StringRef host_handler_name) { + StringRef host_handler_name, bool manual_sharding) { // type 2 == DEVICE_TO_HOST auto channel_handle = ChannelHandleAttr::get(builder.getContext(), /*handle=*/channel_id++, @@ -275,7 +277,7 @@ Value CreateSendOp(OpBuilder& builder, int64_t& channel_id, Location loc, SetFrontendAttributes(send, index, key, operand.getType(), /*device_to_host=*/true, host_handler_name); - SetOpSharding(send); + SetOpSharding(send, manual_sharding); return send.getResult(); } @@ -283,7 +285,7 @@ Value CreateSendOp(OpBuilder& builder, int64_t& channel_id, Location loc, // Creates a `mhlo.recv` op for receiving a value. 
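SetOpSharding now selects between the two sharding protos and serializes the choice onto the op; the manual_sharding flag is threaded in from XlaHostComputeOp's getManualSharding() further down. A hedged sketch of that selection in isolation, assuming the sharding_builder header path and using an illustrative attribute key in place of the pass's kShardingAttr:

#include <string>
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "tensorflow/compiler/xla/client/sharding_builder.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"

// MANUAL sharding sends/recvs one message per core with that core's shard;
// MAXIMAL pins the full tensor to a single core.
void AttachSharding(mlir::Operation* op, bool manual_sharding,
                    int assigned_core) {
  xla::OpSharding sharding =
      manual_sharding ? xla::sharding_builder::Manual()
                      : xla::sharding_builder::AssignDevice(assigned_core);
  // "mhlo.sharding" is an assumed stand-in for the pass's kShardingAttr key.
  op->setAttr("mhlo.sharding",
              mlir::StringAttr::get(op->getContext(),
                                    sharding.SerializeAsString()));
}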
Value CreateRecvOp(OpBuilder& builder, int64_t& channel_id, Location loc, Value result, StringRef key, size_t index, Value token, - StringRef host_handler_name) { + StringRef host_handler_name, bool manual_sharding) { // type 3 == HOST_TO_DEVICE auto channel_handle = ChannelHandleAttr::get(builder.getContext(), /*handle=*/channel_id++, @@ -297,7 +299,7 @@ Value CreateRecvOp(OpBuilder& builder, int64_t& channel_id, Location loc, SetFrontendAttributes(recv, index, key, result_type, /*device_to_host=*/false, host_handler_name); - SetOpSharding(recv); + SetOpSharding(recv, manual_sharding); result.replaceAllUsesWith(recv.getResult(0)); @@ -328,12 +330,14 @@ Value RewriteHostComputeOp(OpBuilder& builder, int64_t& channel_id, Value token) { builder.setInsertionPoint(host_compute); Location loc = host_compute.getLoc(); + bool manual_sharding = host_compute.getManualSharding(); SmallVector send_tokens; for (auto operand : llvm::enumerate(host_compute.getInputs())) { auto send_token = CreateSendOp( builder, channel_id, loc, operand.value(), host_compute.getSendKey(), - operand.index(), token, xla::kXlaHostTransferTfRendezvousHandlerName); + operand.index(), token, xla::kXlaHostTransferTfRendezvousHandlerName, + manual_sharding); send_tokens.push_back(send_token); } token = CreateSinkToken(builder, loc, send_tokens, token); @@ -342,7 +346,8 @@ Value RewriteHostComputeOp(OpBuilder& builder, int64_t& channel_id, for (auto result : llvm::enumerate(host_compute.getOutputs())) { auto recv_token = CreateRecvOp( builder, channel_id, loc, result.value(), host_compute.getRecvKey(), - result.index(), token, xla::kXlaHostTransferTfRendezvousHandlerName); + result.index(), token, xla::kXlaHostTransferTfRendezvousHandlerName, + manual_sharding); recv_tokens.push_back(recv_token); } token = CreateSinkToken(builder, loc, recv_tokens, token); @@ -358,7 +363,8 @@ Value RewriteSendToHostOp(OpBuilder& builder, int64_t& channel_id, token = CreateSendOp(builder, channel_id, send_to_host.getLoc(), send_to_host.getInput(), send_to_host.getKey(), /*index=*/0, token, - xla::kXlaHostTransferTfRendezvousHandlerName); + xla::kXlaHostTransferTfRendezvousHandlerName, + /*manual_sharding=*/false); send_to_host.erase(); return token; @@ -371,7 +377,8 @@ Value RewriteRecvFromHostOp(OpBuilder& builder, int64_t& channel_id, token = CreateRecvOp(builder, channel_id, recv_from_host.getLoc(), recv_from_host.getOutput(), recv_from_host.getKey(), /*index=*/0, token, - xla::kXlaHostTransferTfRendezvousHandlerName); + xla::kXlaHostTransferTfRendezvousHandlerName, + /*manual_sharding=*/false); recv_from_host.erase(); return token; diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td index c78e0e8a709..f28ea6958d3 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td @@ -269,7 +269,7 @@ def : EqualityPat>; //===----------------------------------------------------------------------===// def OneElementAttrPred - : CPred<"$_self.cast().getType().getNumElements() == 1">; + : CPred<"$_self.cast().getShapedType().getNumElements() == 1">; def OneElementAttr : ElementsAttrBase, diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc index 8f74a84288d..ddd3b091e23 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc +++ 
b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc @@ -25,7 +25,6 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" // from @llvm-project #include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -34,21 +33,14 @@ limitations under the License. #include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h" #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_expression.h" @@ -56,7 +48,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/xla/stream_executor/stream_executor.h" #include "tensorflow/compiler/xla/translate/hlo_to_mhlo/mlir_hlo_builder.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" @@ -72,8 +63,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/protobuf/config.pb.h" -#include "tensorflow/core/public/session_options.h" #include "tensorflow/tsl/platform/env.h" #include "tensorflow/tsl/platform/status.h" #include "tensorflow/tsl/platform/statusor.h" @@ -102,6 +91,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -288,6 +278,8 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), }; @@ -347,9 +339,8 @@ bool IsOpAllowedTf2XlaPreferred(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), - TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -368,7 +359,6 @@ bool IsOpAllowedTf2XlaPreferred(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -431,141 +421,13 @@ bool IsOpAllowedTf2XlaPreferred(Operation* op) { } // LINT.ThenChange() -// List of ops that require falling back to XlaOpKernel legalizations and also -// require the ability to create functions. -bool IsOpAllowedTf2XlaFallbackAndCreateFunctions(Operation* op) { - static auto* ops = new llvm::SmallDenseSet{ - TypeID::get(), - }; - auto abstractOp = op->getRegisteredInfo(); - if (!abstractOp) return false; - return ops->count(abstractOp->getTypeID()); -} - bool HasTf2XlaFallback(Operation* op) { return IsOpAllowedTf2XlaFallback(op) || - IsOpAllowedTf2XlaFallbackAndCreateFunctions(op) || IsOpAllowedTf2XlaPreferred(op); } namespace { -template -using InlinedVector = tensorflow::gtl::InlinedVector; // non-absl ok - -static std::unique_ptr CreateDeviceMgr( - const std::string& device_type) { - // Register compilation kernels for all registered XLA backends. - tensorflow::XlaOpRegistry::RegisterCompilationKernels(); - - auto device = std::make_unique( - tensorflow::SessionOptions(), tensorflow::DeviceType(device_type)); - return std::make_unique(std::move(device)); -} - -class Tf2XlaRewriter { - public: - static LogicalResult RewriteOp(Operation* op, PatternRewriter& rewriter, - const std::string& device_type, - bool is_module_pass) { - Tf2XlaRewriter tf2xla_rewriter(op, rewriter, device_type, is_module_pass); - return tf2xla_rewriter.LegalizeOp(); - } - - private: - Tf2XlaRewriter(Operation* op, PatternRewriter& rewriter, - const std::string& device_type, bool is_module_pass) - : op_(op), - device_type_(device_type), - rewriter_(rewriter), - hlo_builder_(op->getName().getStringRef().str(), rewriter_, - op->getLoc(), /*build_functions=*/is_module_pass), - context_(nullptr) {} - - ~Tf2XlaRewriter() { - if (context_) context_->Unref(); - } - - // Prepares OpKernelContext params common to all the ops. - // Emits an error on failure. - LogicalResult PrepareParams(); - - // Tries to legalize the specified TensorFlow op, if supported. - // - // Emits an error and returns failure if an error is encountered during - // conversion. Note that success return value doesn't mean successful - // legalization. - LogicalResult LegalizeOp(); - - // Converts the given operand to expression of kind kConstant or kXlaOp. - // Emits a remark and returns expression of kind kInvalid on failure. 
- tensorflow::XlaExpression GetExprForOperand(Value operand, Operation* op); - - Operation* op_; - std::string device_type_; - - PatternRewriter& rewriter_; - ::xla::MlirHloBuilder hlo_builder_; - tensorflow::OpOrArgLocNameMapper name_mapper_; - - tensorflow::XlaContext* context_; // Ref-counted. - - std::unique_ptr device_mgr_; - tensorflow::Device* device_; // Owned by device_mgr_; - std::unique_ptr step_container_; - std::unique_ptr flib_def_; - std::unique_ptr pflr_; - tensorflow::OpKernelContext::Params params_; -}; - -LogicalResult Tf2XlaRewriter::PrepareParams() { - // XlaCompiler within the context is only used by the functional ops to - // compile functions. We are not handling those at the moment so XlaCompiler - // is not required. - context_ = new tensorflow::XlaContext(/*compiler=*/nullptr, &hlo_builder_, - /*graph=*/nullptr); - context_->Ref(); - - device_mgr_ = CreateDeviceMgr(device_type_); - if (!device_mgr_) return failure(); - - // Type of params_.device is DeviceBase* so store it as Device* to access - // derived class method. - device_ = device_mgr_->ListDevices().front(); - params_.device = device_; - params_.resource_manager = device_->resource_manager(); - - // Resources are cleared at the time of device manager destruction so pass - // no-op cleanup function. - auto cleanup = [](const std::string& name) {}; - // Use step_id zero as we only have a single context concurrently and - // concurrently running each of the MLIR functions create a new device. - step_container_ = std::make_unique( - /*step_id=*/0, cleanup); - tsl::Status status = step_container_->Create( - device_->resource_manager(), - tensorflow::XlaContext::kXlaContextResourceName, context_); - if (!status.ok()) { - return emitRemark(op_->getLoc()) - << "failed to create XlaContext resource: " << status.ToString(); - } - params_.step_container = step_container_.get(); - - tsl::StatusOr version_or = tensorflow::GetTfGraphProducerVersion( - op_->getParentOfType()); - if (!version_or.ok()) { - return emitError(op_->getLoc()) << version_or.status().ToString(); - } - - flib_def_ = std::make_unique( - tensorflow::OpRegistry::Global(), tensorflow::FunctionDefLibrary()); - pflr_ = std::make_unique( - device_mgr_.get(), tensorflow::Env::Default(), /*config=*/nullptr, - version_or.value(), flib_def_.get(), tensorflow::OptimizerOptions()); - params_.function_library = pflr_->GetFLR(device_->name()); - return success(); -} - // Returns true if the given type is a ranked tensor type with static or bounded // dimensions. bool IsBounded(Type ty) { @@ -601,183 +463,16 @@ bool HasSymbolRefAttr(Operation* op) { return false; } -LogicalResult Tf2XlaRewriter::LegalizeOp() { - for (Type ty : op_->getOperandTypes()) { - auto ranked_ty = ty.dyn_cast(); - // Only bounded operands are supported in the XLA builders. 
- if (!IsBounded(ranked_ty)) { - return op_->emitRemark() - << "lowering requires bounded tensor operands " << ranked_ty; - } - } - - if (HasSymbolRefAttr(op_)) { - return op_->emitRemark() << "ops with symbol references are not supported"; - } - - auto nodedef_or = tensorflow::ConvertTFDialectOpToNodeDef( - op_, name_mapper_.GetUniqueName(op_), /*ignore_unregistered_attrs=*/true); - if (!nodedef_or.ok()) { - return op_->emitRemark() << "failed to convert op to NodeDef: " - << nodedef_or.status().ToString(); - } - - if (failed(PrepareParams())) return failure(); - - std::shared_ptr props; - tsl::Status status = tensorflow::NodeProperties::CreateFromNodeDef( - *nodedef_or.value(), - params_.function_library->GetFunctionLibraryDefinition(), &props); - if (!status.ok()) { - return op_->emitRemark() - << "failed to create NodeProperties: " << status.ToString(); - } - tensorflow::OpKernel* op_kernel_raw; - status = params_.function_library->CreateKernel(props, &op_kernel_raw); - if (!status.ok()) { - return op_->emitRemark() - << "failed to create tf2xla kernel: " << status.ToString(); - } - // Transfer ownership of the kernel to a local smart pointer. - auto op_kernel = absl::WrapUnique(op_kernel_raw); - - std::vector required_constants; - status = tensorflow::XlaOpRegistry::CompileTimeConstantInputs( - *op_kernel, &required_constants); - if (!status.ok()) { - return op_->emitRemark() - << "failed to compute required constants: " << status.ToString(); - } - llvm::SmallDenseSet required_consts; - required_consts.insert(required_constants.begin(), required_constants.end()); - - // TensorValue in inputs are backed by tensors which in turn depend on - // expressions. So, pre-allocate them to the required size. - InlinedVector expressions; - InlinedVector tensors; - InlinedVector inputs; - expressions.reserve(op_->getNumOperands()); - tensors.reserve(op_->getNumOperands()); - inputs.reserve(op_->getNumOperands()); - - // Prepare the list of Tensor inputs for the kernel. - for (auto it : llvm::enumerate(op_->getOperands())) { - Value operand = it.value(); - size_t idx = it.index(); - - tensorflow::XlaExpression expr = GetExprForOperand(operand, op_); - tensorflow::XlaExpression::Kind kind = expr.kind(); - if (kind == tensorflow::XlaExpression::Kind::kInvalid) return failure(); - if (required_consts.count(idx) && - kind != tensorflow::XlaExpression::Kind::kConstant) { - return op_->emitRemark() - << "lowering requires operand #" << idx << " to be a constant"; - } - expressions.push_back(expr); - - if (!tensorflow::DataTypeCanUseMemcpy(expr.dtype())) { - return op_->emitRemark() - << "skipping legalization due to unsupported type " - << operand.getType(); - } - - auto shape_or = expr.GetShape(); - if (!shape_or.ok()) { - return op_->emitRemark() - << "failed to get shape for expression. " << expr.HumanString(); - } - - tensors.emplace_back( - device_->GetAllocator(tensorflow::AllocatorAttributes()), expr.dtype(), - shape_or.value()); - tensorflow::Tensor& tensor = tensors.back(); - tensorflow::XlaExpression::AssignExpressionToTensor(expr, &tensor); - inputs.emplace_back(&tensor); - } - - params_.inputs = inputs; - params_.op_kernel = op_kernel.get(); - llvm::SmallVector output_attr( - op_->getNumResults()); - params_.output_attr_array = output_attr.data(); - - hlo_builder_.setInsertionPoint(op_); - hlo_builder_.SetLocation(op_->getLoc()); - - // Execute the kernel. 
- tensorflow::OpKernelContext op_context(¶ms_, op_->getNumResults()); - device_->Compute(params_.op_kernel, &op_context); - - status = op_context.status(); - status.Update(hlo_builder_.GetCurrentStatus()); - if (!status.ok()) { - return op_->emitRemark() - << "compilation to HLO failed: " << status.ToString(); - } - - // Replace uses of old results using the corresponding value after the - // lowering. - llvm::SmallVector values; - values.reserve(op_->getNumResults()); - for (int i = 0, e = op_->getNumResults(); i < e; i++) { - tensorflow::Tensor* output = op_context.mutable_output(i); - const tensorflow::XlaExpression* expr = - tensorflow::XlaExpression::CastExpressionFromTensor(*output); - if (expr->kind() != tensorflow::XlaExpression::Kind::kXlaOp && - expr->kind() != tensorflow::XlaExpression::Kind::kConstant) { - return op_->emitRemark( - "expects XlaExpression of kind kXlaOp or kConstant in compiled " - "output"); - } - mlir::Value value = hlo_builder_.GetValue(expr->AsXlaOp(&hlo_builder_)); - values.push_back(value); - } - rewriter_.replaceOp(op_, values); - return success(); -} - -tensorflow::XlaExpression Tf2XlaRewriter::GetExprForOperand(Value operand, - Operation* op) { - ElementsAttr const_attr; - auto defining_op = operand.getDefiningOp(); - if (defining_op && matchPattern(defining_op, m_Constant(&const_attr))) { - tensorflow::Tensor tensor; - auto status = tensorflow::ConvertToTensor(const_attr, &tensor); - if (!status.ok()) { - op->emitRemark() << "skipping legalization due to failed const conversion" - << status.ToString(); - return tensorflow::XlaExpression::Invalid(); - } - return tensorflow::XlaExpression::Constant(tensor); - } - - // Skip this op if XLA doesn't support this operand type. - auto xla_op_or = hlo_builder_.MakeXlaOp(operand); - if (!xla_op_or.ok()) { - op->emitRemark() << "skipping legalization due to " - << xla_op_or.status().ToString(); - return tensorflow::XlaExpression::Invalid(); - } - ::xla::XlaOp xla_op = xla_op_or.value(); - - tensorflow::DataType dtype; - auto status = tensorflow::ConvertToDataType(operand.getType(), &dtype); - if (!status.ok()) { - op->emitRemark() << "skipping legalization due to " << status.ToString(); - return tensorflow::XlaExpression::Invalid(); - } - return tensorflow::XlaExpression::XlaOp(xla_op, dtype); -} - class Tf2XlaRewritePattern : public ConversionPattern { public: explicit Tf2XlaRewritePattern(MLIRContext* ctx, TypeConverter& converter, const std::string& device_type, - bool prefer_tf2xla, bool is_module_pass) + bool prefer_tf2xla, + bool use_tf2xla_hlo_importer) : ConversionPattern(converter, MatchAnyOpTypeTag(), /*benefit=*/1, ctx), device_type_(device_type), prefer_tf2xla_(prefer_tf2xla), - is_module_pass_(is_module_pass) {} + use_tf2xla_hlo_importer_(use_tf2xla_hlo_importer) {} LogicalResult matchAndRewrite( Operation* op, ArrayRef operands, @@ -790,25 +485,19 @@ class Tf2XlaRewritePattern : public ConversionPattern { if (old_val.getType() != new_val.getType()) return failure(); } - if (is_module_pass_) { - // Module passes should only ever legalize ops that have been specifically - // whitelisted for legalization within a module pass. They will never - // legalize any ops whitelisted for legalization within a func pass. 
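With the module-pass-only allowlist removed, the fallback pattern is configured solely by use_tf2xla_hlo_importer, and PopulateLegalizeTfWithTf2XlaPatterns (see the passes.h hunk below) gains the same flag. A hypothetical caller wiring it up; only the signature and flag names are taken from the patch, the surrounding helper is illustrative:

#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/PatternMatch.h"
#include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h"

// Populate the TF->MHLO fallback patterns with the HLO-importer path enabled.
// The device type mirrors the one used by the new unit tests.
void AddTf2XlaFallbackPatterns(mlir::MLIRContext* ctx,
                               mlir::RewritePatternSet& patterns,
                               mlir::mhlo::Tf2XlaTypeConverter& converter) {
  mlir::mhlo::PopulateLegalizeTfWithTf2XlaPatterns(
      /*device_type=*/"XLA_CPU_JIT", patterns, ctx, converter,
      /*prefer_tf2xla=*/false,
      /*use_tf2xla_hlo_importer=*/true);
}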
- if (!IsOpAllowedTf2XlaFallbackAndCreateFunctions(op)) { - return failure(); - } - } else if (!(IsOpAllowedTf2XlaFallback(op) || - (prefer_tf2xla_ && IsOpAllowedTf2XlaPreferred(op)))) { + if (!(IsOpAllowedTf2XlaFallback(op) || + (prefer_tf2xla_ && IsOpAllowedTf2XlaPreferred(op)))) { return failure(); } + return Tf2XlaRewriter::RewriteOp(op, rewriter, device_type_, - is_module_pass_); + use_tf2xla_hlo_importer_); } private: std::string device_type_; bool prefer_tf2xla_; - bool is_module_pass_; + bool use_tf2xla_hlo_importer_; }; bool ShouldRefineTypeTo(Type original_ty, Type updated_ty) { @@ -897,12 +586,15 @@ Tf2XlaTypeConverter::Tf2XlaTypeConverter() { addSourceMaterialization(cast_value); } -void PopulateLegalizeTfWithTf2XlaPatterns( - llvm::StringRef device_type, RewritePatternSet& patterns, MLIRContext* ctx, - Tf2XlaTypeConverter& converter, bool prefer_tf2xla, bool is_module_pass) { +void PopulateLegalizeTfWithTf2XlaPatterns(llvm::StringRef device_type, + RewritePatternSet& patterns, + MLIRContext* ctx, + Tf2XlaTypeConverter& converter, + bool prefer_tf2xla, + bool use_tf2xla_hlo_importer) { patterns.add(ctx); patterns.add(ctx, converter, device_type.str(), - prefer_tf2xla, is_module_pass); + prefer_tf2xla, use_tf2xla_hlo_importer); } } // end namespace mhlo diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/passes.h b/tensorflow/compiler/mlir/tf2xla/transforms/passes.h index 4438756a419..e805b069f86 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/tf2xla/transforms/passes.h @@ -46,16 +46,14 @@ namespace mhlo { /// patterns from TF2XLA fallback for provided device type (see /// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is not /// used. -std::unique_ptr> createLegalizeTFPass( +/// Note: This is a module pass because when legalizing with TF2XLA fallback, +/// functions are imported into the module. Importing functions into a +/// module is not thread safe. +std::unique_ptr> createLegalizeTFPass( bool allow_partial_conversion = false, bool legalize_chlo = true, std::optional tf2xla_fallback_device_type = std::nullopt, bool prefer_tf2xla = false); -/// Legalize whitelisted Ops using TF2XLA fallback for ops that must also be -/// able to create new functions. -std::unique_ptr> createLegalizeTFModulePass( - StringRef tf2xla_fallback_device_type = ""); - // Legalizes from MHLO quantized ops with MHLO quant types to MHLO primitive ops // like int ops. std::unique_ptr> createConvertMHLOQuantToIntPass(); @@ -84,7 +82,7 @@ void PopulateLegalizeTfWithTf2XlaPatterns(llvm::StringRef device_type, MLIRContext* ctx, Tf2XlaTypeConverter& converter, bool prefer_tf2xla = false, - bool is_module_pass = false); + bool use_tf2xla_hlo_importer = false); /// Adds the TF to TF lowerings and TF to XLA rewrite patterns to the pattern /// list. diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc new file mode 100644 index 00000000000..a8d36fe1fce --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc @@ -0,0 +1,55 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h" + +#include +#include +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/register_common_dialects.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace mlir { +namespace mhlo { +namespace test { + +using ::mlir::DialectRegistry; +using ::mlir::MLIRContext; +using ::mlir::ModuleOp; +using ::mlir::OwningOpRef; +using ::tsl::StatusOr; + +StatusOr> GetMlirModuleFromString( + absl::string_view module_string, MLIRContext* context) { + DialectRegistry mlir_registry; + RegisterCommonToolingDialects(mlir_registry); + context->appendDialectRegistry(mlir_registry); + + OwningOpRef mlir_module; + auto status = + tensorflow::DeserializeMlirModule(module_string, context, &mlir_module); + if (!status.ok()) { + return status; + } + return mlir_module; +} + +} // namespace test +} // namespace mhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h new file mode 100644 index 00000000000..15ea2bc7412 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h @@ -0,0 +1,38 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TEST_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TEST_UTILS_H_ + +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace mlir { +namespace mhlo { +namespace test { + +// Given a raw string, return a ModuleOp that can be used with the given +// MLIRContext. 
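The helper declared just below wraps DeserializeMlirModule with common-dialect registration for use in tests. A minimal hypothetical use, with a made-up module string:

#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OwningOpRef.h"
#include "tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h"
#include "tensorflow/tsl/platform/status.h"
#include "tensorflow/tsl/platform/statusor.h"

tsl::Status ParseExampleModule() {
  mlir::MLIRContext context;
  // Parse an illustrative empty module; real tests pass TF dialect IR.
  TF_ASSIGN_OR_RETURN(
      mlir::OwningOpRef<mlir::ModuleOp> module,
      mlir::mhlo::test::GetMlirModuleFromString(
          R"(module { func.func @main() { func.return } })", &context));
  return tsl::OkStatus();
}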
+tsl::StatusOr> GetMlirModuleFromString( + absl::string_view module_string, MLIRContext* mlir_context); + +} // namespace test +} // namespace mhlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TEST_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc new file mode 100644 index 00000000000..4117b5ce026 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc @@ -0,0 +1,554 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/container/inlined_vector.h" +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" // from @llvm-project +#include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" +#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include 
"tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/hlo/ir/hlo_instruction.h" +#include "tensorflow/compiler/xla/hlo/ir/hlo_opcode.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/translate/hlo_to_mhlo/hlo_function_importer.h" +#include "tensorflow/compiler/xla/translate/hlo_to_mhlo/hlo_to_mlir_hlo.h" +#include "tensorflow/compiler/xla/translate/hlo_to_mhlo/mlir_hlo_builder.h" +#include "tensorflow/compiler/xla/translate/mhlo_to_hlo/type_to_shape.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/tsl/platform/env.h" +#include "tensorflow/tsl/platform/errors.h" +#include "tensorflow/tsl/platform/status.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace mlir { +namespace mhlo { +namespace { + +using ::mlir::FunctionType; +using ::mlir::ModuleOp; +using ::mlir::OwningOpRef; +using ::mlir::func::FuncOp; +using ::tensorflow::Tensor; +using ::tsl::StatusOr; +using ::xla::XlaComputation; + +static std::unique_ptr CreateDeviceMgr( + const std::string& device_type) { + // Register compilation kernels for all registered XLA backends. 
+ tensorflow::XlaOpRegistry::RegisterCompilationKernels(); + + auto device = std::make_unique( + tensorflow::SessionOptions(), tensorflow::DeviceType(device_type)); + return std::make_unique(std::move(device)); +} + +bool RootInstructionIsTuple(const xla::HloModule& hlo_module) { + xla::HloInstruction* root_instruction = + hlo_module.entry_computation()->root_instruction(); + + return root_instruction->opcode() == xla::HloOpcode::kTuple; +} + +}; // namespace + +LogicalResult Tf2XlaRewriter::RewriteOp(Operation* op, + PatternRewriter& rewriter, + const std::string& device_type, + bool use_tf2xla_hlo_importer) { + Tf2XlaRewriter tf2xla_rewriter(op, rewriter, device_type, + use_tf2xla_hlo_importer); + return tf2xla_rewriter.LegalizeOp(); +} + +Tf2XlaRewriter::Tf2XlaRewriter(Operation* op, PatternRewriter& rewriter, + const std::string& device_type, + bool use_tf2xla_hlo_importer) + : op_(op), + device_type_(device_type), + rewriter_(rewriter), + hlo_builder_(op->getName().getStringRef().str(), rewriter_, op->getLoc(), + /*build_functions=*/true), + context_(nullptr), + use_tf2xla_hlo_importer_(use_tf2xla_hlo_importer), + xla_builder_(op_->getName().getStringRef().str()) {} + +Tf2XlaRewriter::~Tf2XlaRewriter() { + if (context_) context_->Unref(); +} + +tsl::StatusOr Tf2XlaRewriter::ImportXlaComputation( + XlaComputation& computation) { + xla::DebugOptions debug_options; + TF_ASSIGN_OR_RETURN(auto hlo_module_config, + xla::HloModule::CreateModuleConfigFromProto( + computation.proto(), debug_options)); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_module, + xla::HloModule::CreateFromProto(computation.proto(), hlo_module_config)); + + if (!RootInstructionIsTuple(*hlo_module)) { + return tsl::errors::InvalidArgument("Imported XLA Root is not a tuple op"); + } + + ModuleOp mlir_module = op_->getParentOfType(); + mlir::OpBuilder builder(op_); + mlir::SymbolTable symbol_table(mlir_module); + + llvm::SmallVector arguments; + for (int i = 0; i < op_->getNumOperands(); i++) { + arguments.push_back(op_->getOperand(i)); + } + + // Ideally we could use the Function Importer but it increases compilation + // time when we have a model with thousands of tf2xla op fallbacks. At time + // of writing, this caused compilation time to be greater than 2x slower. + // So we have to directly import these instructions. + TF_ASSIGN_OR_RETURN( + mlir::Value root_value, + xla::HloFunctionImporter::ImportInstructions( + *hlo_module->entry_computation(), arguments, symbol_table, &builder)); + + mhlo::TupleOp root_tuple = + mlir::dyn_cast_or_null(root_value.getDefiningOp()); + if (!root_tuple) { + return tsl::errors::InvalidArgument( + "Imported XLA Root Value is not a tuple op"); + } + + return root_tuple; +} + +LogicalResult Tf2XlaRewriter::PrepareParams() { + // XlaCompiler within the context is only used by the functional ops to + // compile functions. We are not handling those at the moment so + // XlaCompiler is not required. + if (use_tf2xla_hlo_importer_) { + context_ = new tensorflow::XlaContext(/*compiler=*/nullptr, &xla_builder_, + /*graph=*/nullptr); + } else { + context_ = new tensorflow::XlaContext(/*compiler=*/nullptr, &hlo_builder_, + /*graph=*/nullptr); + } + context_->Ref(); + + device_mgr_ = CreateDeviceMgr(device_type_); + if (!device_mgr_) return failure(); + + // Type of params_.device is DeviceBase* so store it as Device* to access + // derived class method. 
+ device_ = device_mgr_->ListDevices().front(); + params_.device = device_; + params_.resource_manager = device_->resource_manager(); + + // Resources are cleared at the time of device manager destruction so pass + // no-op cleanup function. + auto cleanup = [](const std::string& name) {}; + // Use step_id zero as we only have a single context concurrently and + // concurrently running each of the MLIR functions create a new device. + step_container_ = std::make_unique( + /*step_id=*/0, cleanup); + tsl::Status status = step_container_->Create( + device_->resource_manager(), + tensorflow::XlaContext::kXlaContextResourceName, context_); + if (!status.ok()) { + return emitRemark(op_->getLoc()) + << "failed to create XlaContext resource: " << status.ToString(); + } + params_.step_container = step_container_.get(); + + tsl::StatusOr version_or = tensorflow::GetTfGraphProducerVersion( + op_->getParentOfType()); + if (!version_or.ok()) { + return emitError(op_->getLoc()) << version_or.status().ToString(); + } + + flib_def_ = std::make_unique( + tensorflow::OpRegistry::Global(), tensorflow::FunctionDefLibrary()); + pflr_ = std::make_unique( + device_mgr_.get(), tensorflow::Env::Default(), /*config=*/nullptr, + version_or.value(), flib_def_.get(), tensorflow::OptimizerOptions()); + params_.function_library = pflr_->GetFLR(device_->name()); + return success(); +} + +// Returns true if the given type is a ranked tensor type with static or +// bounded dimensions. +bool IsBounded(Type ty) { + auto ranked_ty = ty.dyn_cast(); + if (!ranked_ty) return false; + + if (ranked_ty.hasStaticShape()) return true; + + auto encoding = + ranked_ty.getEncoding().dyn_cast_or_null(); + if (!encoding) return false; + + for (int i = 0; i < ranked_ty.getRank(); ++i) { + if (ranked_ty.isDynamicDim(i) && + encoding.getBounds()[i] == ShapedType::kDynamic) { + return false; + } + } + return true; +} + +bool HasSymbolRefAttr(Operation* op) { + for (const auto& attr : op->getAttrs()) { + Attribute attr_value = attr.getValue(); + if (attr_value.isa()) { + return true; + } else if (auto array_attr = attr_value.dyn_cast()) { + if (!array_attr.empty() && array_attr.begin()->isa()) { + return true; + } + } + } + return false; +} + +LogicalResult Tf2XlaRewriter::PrepareKernelInputs( + const llvm::SmallDenseSet& required_consts, + std::vector& expressions, + std::vector& tensors, + std::vector& inputs) { + // Prepare the list of Tensor inputs for the kernel. + for (auto it : llvm::enumerate(op_->getOperands())) { + Value operand = it.value(); + size_t idx = it.index(); + + tensorflow::XlaExpression expr = GetExprForOperand(operand, op_, idx); + tensorflow::XlaExpression::Kind kind = expr.kind(); + if (kind == tensorflow::XlaExpression::Kind::kInvalid) return failure(); + if (required_consts.count(idx) && + kind != tensorflow::XlaExpression::Kind::kConstant) { + return op_->emitRemark() + << "lowering requires operand #" << idx << " to be a constant"; + } + expressions.push_back(expr); + + if (!tensorflow::DataTypeCanUseMemcpy(expr.dtype())) { + return op_->emitRemark() + << "skipping legalization due to unsupported type " + << operand.getType(); + } + + auto shape_or = expr.GetShape(); + if (!shape_or.ok()) { + return op_->emitRemark() + << "failed to get shape for expression. 
" << expr.HumanString(); + } + + tensors.emplace_back( + device_->GetAllocator(tensorflow::AllocatorAttributes()), expr.dtype(), + shape_or.value()); + + tensorflow::Tensor& tensor = tensors.back(); + tensorflow::XlaExpression::AssignExpressionToTensor(expr, &tensor); + inputs.emplace_back(&tensor); + } + + return success(); +} + +LogicalResult Tf2XlaRewriter::LegalizeOp() { + for (Type ty : op_->getOperandTypes()) { + auto ranked_ty = ty.dyn_cast(); + // Only bounded operands are supported in the XLA builders. + if (!IsBounded(ranked_ty)) { + return op_->emitRemark() + << "lowering requires bounded tensor operands " << ranked_ty; + } + } + + if (HasSymbolRefAttr(op_)) { + return op_->emitRemark() << "ops with symbol references are not supported"; + } + + auto nodedef_or = tensorflow::ConvertTFDialectOpToNodeDef( + op_, name_mapper_.GetUniqueName(op_), + /*ignore_unregistered_attrs=*/true); + if (!nodedef_or.ok()) { + return op_->emitRemark() << "failed to convert op to NodeDef: " + << nodedef_or.status().ToString(); + } + + if (failed(PrepareParams())) return failure(); + + std::shared_ptr props; + tsl::Status status = tensorflow::NodeProperties::CreateFromNodeDef( + *nodedef_or.value(), + params_.function_library->GetFunctionLibraryDefinition(), &props); + if (!status.ok()) { + return op_->emitRemark() + << "failed to create NodeProperties: " << status.ToString(); + } + tensorflow::OpKernel* op_kernel_raw; + status = params_.function_library->CreateKernel(props, &op_kernel_raw); + if (!status.ok()) { + return op_->emitRemark() + << "failed to create tf2xla kernel: " << status.ToString(); + } + // Transfer ownership of the kernel to a local smart pointer. + auto op_kernel = absl::WrapUnique(op_kernel_raw); + + std::vector required_constants; + status = tensorflow::XlaOpRegistry::CompileTimeConstantInputs( + *op_kernel, &required_constants); + if (!status.ok()) { + return op_->emitRemark() + << "failed to compute required constants: " << status.ToString(); + } + + llvm::SmallDenseSet required_consts; + required_consts.insert(required_constants.begin(), required_constants.end()); + + // TensorValue in inputs are backed by tensors which in turn depend on + // expressions. So, pre-allocate them to the required size. Subtle note: + // Since these are assigned to params_, these have to live past the kernel + // compilation. 
+ std::vector expressions; + std::vector tensors; + std::vector inputs; + expressions.reserve(op_->getNumOperands()); + tensors.reserve(op_->getNumOperands()); + inputs.reserve(op_->getNumOperands()); + + if (failed( + PrepareKernelInputs(required_consts, expressions, tensors, inputs))) + return failure(); + + params_.inputs = inputs; + params_.op_kernel = op_kernel.get(); + llvm::SmallVector output_attr( + op_->getNumResults()); + params_.output_attr_array = output_attr.data(); + + hlo_builder_.setInsertionPoint(op_); + hlo_builder_.SetLocation(op_->getLoc()); + + tensorflow::OpKernelContext op_context(¶ms_, op_->getNumResults()); + device_->Compute(params_.op_kernel, &op_context); + + status = op_context.status(); + status.Update(hlo_builder_.GetCurrentStatus()); + if (!status.ok()) { + return op_->emitRemark() + << "compilation to HLO failed: " << status.ToString(); + } + + if (failed(VerifyOpResults(op_context))) return failure(); + + mhlo::TupleOp tuple_result; + if (use_tf2xla_hlo_importer_) { + StatusOr tuple_result_or_status = + CompileWithHloImporter(op_context); + if (!tuple_result_or_status.ok()) { + return op_->emitRemark() << tuple_result_or_status.status().ToString(); + } + tuple_result = tuple_result_or_status.value(); + } + + llvm::SmallVector output_values; + if (failed(GetKernelOutputs(op_context, tuple_result, output_values))) { + return failure(); + } + + rewriter_.replaceOp(op_, output_values); + return success(); +} + +tsl::StatusOr Tf2XlaRewriter::CompileWithHloImporter( + tensorflow::OpKernelContext& op_context) { + if (!use_tf2xla_hlo_importer_) { + return tsl::errors::InvalidArgument( + "Cannot compile with HloImporter because it isn't supported"); + } + + // XLA can only return a single value. Wrap all output op return values + // in a Tuple op that gets unpacked later. + std::vector output_values; + for (int i = 0, e = op_->getNumResults(); i < e; i++) { + tensorflow::Tensor* output = op_context.mutable_output(i); + const tensorflow::XlaExpression* expr = + tensorflow::XlaExpression::CastExpressionFromTensor(*output); + output_values.push_back(expr->AsXlaOp(&xla_builder_)); + } + + absl::Span return_values(output_values); + xla::XlaOp root_value = xla::Tuple(&xla_builder_, return_values); + + TF_ASSIGN_OR_RETURN(XlaComputation computation, + xla_builder_.Build(root_value, + /*remove_dynamic_dimensions=*/false)); + + return ImportXlaComputation(computation); +} + +mlir::LogicalResult Tf2XlaRewriter::VerifyOpResults( + tensorflow::OpKernelContext& op_context) { + for (int i = 0, e = op_->getNumResults(); i < e; i++) { + tensorflow::Tensor* output = op_context.mutable_output(i); + const tensorflow::XlaExpression* expr = + tensorflow::XlaExpression::CastExpressionFromTensor(*output); + + if (expr->kind() != tensorflow::XlaExpression::Kind::kXlaOp && + expr->kind() != tensorflow::XlaExpression::Kind::kConstant) { + return op_->emitRemark(absl::StrCat( + "expects XlaExpression of kind kXlaOp or kConstant in compiled " + "output index ", + i)); + } + } + return success(); +} + +// XLA computations can only return a single value, but TF ops can return +// multiple values. We get around this by returning a tuple as an XLA op. We +// then unpack it here to return the multiple values instead. 
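The producer side of the tuple workaround used by CompileWithHloImporter can be sketched in isolation: because an XlaComputation has a single root, every logical output is wrapped in one xla::Tuple and unpacked again after import. In this hedged sketch, constants stand in for the kernel's real outputs:

#include <vector>
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/client/xla_computation.h"

xla::XlaComputation BuildTupleRootedComputation() {
  xla::XlaBuilder builder("multi_output_example");
  std::vector<xla::XlaOp> outputs;
  outputs.push_back(xla::ConstantR0<float>(&builder, 1.0f));
  outputs.push_back(xla::ConstantR0<float>(&builder, 2.0f));
  // Wrap all outputs in a single tuple so the computation has one root.
  xla::XlaOp root = xla::Tuple(&builder, outputs);
  return builder.Build(root).value();
}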
+mlir::LogicalResult Tf2XlaRewriter::UnpackTupleResults( + mhlo::TupleOp tuple_result, llvm::SmallVector& outputs) { + if (tuple_result->getNumOperands() != op_->getNumResults()) { + return op_->emitRemark() << "Translated TF2XLA tuple has different " + "number of results than original op"; + } + + for (int i = 0; i < tuple_result->getNumOperands(); i++) { + outputs.push_back(tuple_result->getOperand(i)); + } + + tuple_result.getOperation()->erase(); + return success(); +} + +mlir::LogicalResult Tf2XlaRewriter::GetKernelOutputs( + tensorflow::OpKernelContext& op_context, mhlo::TupleOp tuple_results, + llvm::SmallVector& outputs) { + outputs.reserve(op_->getNumResults()); + + if (use_tf2xla_hlo_importer_) { + return UnpackTupleResults(tuple_results, outputs); + } + + for (int i = 0, e = op_->getNumResults(); i < e; i++) { + tensorflow::Tensor* output = op_context.mutable_output(i); + const tensorflow::XlaExpression* expr = + tensorflow::XlaExpression::CastExpressionFromTensor(*output); + + mlir::Value value = hlo_builder_.GetValue(expr->AsXlaOp(&hlo_builder_)); + outputs.push_back(value); + } + + return success(); +} + +tensorflow::XlaExpression Tf2XlaRewriter::GetExprForOperand( + Value operand, Operation* op, int64_t operand_index) { + ElementsAttr const_attr; + auto defining_op = operand.getDefiningOp(); + + ::xla::XlaOp xla_op; + if (use_tf2xla_hlo_importer_) { + xla_op = xla::Parameter(&xla_builder_, operand_index, + xla::TypeToShape(operand.getType()), + std::to_string(operand_index)); + } + + if (defining_op && matchPattern(defining_op, m_Constant(&const_attr))) { + tensorflow::Tensor tensor; + auto status = tensorflow::ConvertToTensor(const_attr, &tensor); + if (!status.ok()) { + op->emitRemark() << "skipping legalization due to failed const conversion" + << status.ToString(); + return tensorflow::XlaExpression::Invalid(); + } + + return tensorflow::XlaExpression::Constant(tensor); + } + + if (!use_tf2xla_hlo_importer_) { + auto xla_op_or = hlo_builder_.MakeXlaOp(operand); + if (!xla_op_or.ok()) { + op->emitRemark() << "skipping legalization due to " + << xla_op_or.status().ToString(); + return tensorflow::XlaExpression::Invalid(); + } + xla_op = xla_op_or.value(); + } + + tensorflow::DataType dtype; + auto status = tensorflow::ConvertToDataType(operand.getType(), &dtype); + if (!status.ok()) { + op->emitRemark() << "skipping legalization due to " << status.ToString(); + return tensorflow::XlaExpression::Invalid(); + } + return tensorflow::XlaExpression::XlaOp(xla_op, dtype); +} + +} // namespace mhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h new file mode 100644 index 00000000000..642674469d1 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h @@ -0,0 +1,130 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TF2XLA_REWRITER_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TF2XLA_REWRITER_H_ + +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/xla/translate/hlo_to_mhlo/mlir_hlo_builder.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace mlir { +namespace mhlo { + +class Tf2XlaRewriterTestPeer; + +class Tf2XlaRewriter { + public: + static mlir::LogicalResult RewriteOp(mlir::Operation* op, + mlir::PatternRewriter& rewriter, + const std::string& device_type, + bool use_tf2xla_hlo_importer); + + private: + friend class Tf2XlaRewriterTestPeer; + + Tf2XlaRewriter(mlir::Operation* op, mlir::PatternRewriter& rewriter, + const std::string& device_type, bool use_tf2xla_hlo_importer); + + ~Tf2XlaRewriter(); + + // Compiles the given Operation with XlaBuilder and imports the generated HLO + // via the HLO -> MHLO importer. + tsl::StatusOr CompileWithHloImporter( + tensorflow::OpKernelContext& op_context); + + // Import the given XlaComputation into the parent module. Returns the given + // generated function. + tsl::StatusOr ImportXlaComputation( + xla::XlaComputation& computation); + + // Prepares OpKernelContext params common to all the ops. + // Emits an error on failure. + mlir::LogicalResult PrepareParams(); + + // Given the required_consts, it will fill the 3 output vectors with + // their respective data. + // Expressions: Output XLA expressions as required by the compiled kernel. + // Tensors: Vector of tensors that back the TensorValue inputs + // Inputs: Vector of inputs that are backed by tensors. + mlir::LogicalResult PrepareKernelInputs( + const llvm::SmallDenseSet& required_consts, + std::vector& expressions, + std::vector& tensors, + std::vector& inputs); + + mlir::LogicalResult VerifyOpResults(tensorflow::OpKernelContext& op_context); + mlir::LogicalResult GetKernelOutputs(tensorflow::OpKernelContext& op_context, + mhlo::TupleOp tuple_results, + llvm::SmallVector& outputs); + + // Given a translated function with a single return value, unpack the tuple + // results. + mlir::LogicalResult UnpackTupleResults(mhlo::TupleOp tuple_result, + llvm::SmallVector& outputs); + + // Tries to legalize the specified TensorFlow op, if supported. + // + // Emits an error and returns failure if an error is encountered during + // conversion. Note that success return value doesn't mean successful + // legalization. + mlir::LogicalResult LegalizeOp(); + + // Converts the given operand to expression of kind kConstant or kXlaOp. + // Emits a remark and returns expression of kind kInvalid on failure. 
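On the importer path, GetExprForOperand (defined in tf2xla_rewriter.cc above) surfaces each MLIR operand to the standalone XlaBuilder as an xla::Parameter numbered by operand index. A hedged standalone sketch; the shape and builder name here are illustrative:

#include <string>
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"

// Create a parameter for an operand; in the rewriter the shape comes from
// xla::TypeToShape(operand.getType()) rather than a fixed f32[2,3].
xla::XlaOp MakeOperandParameter(xla::XlaBuilder& builder, int64_t index) {
  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 3});
  return xla::Parameter(&builder, /*parameter_number=*/index, shape,
                        std::to_string(index));
}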
+ tensorflow::XlaExpression GetExprForOperand(mlir::Value operand, + mlir::Operation* op, + int64_t operand_index); + + mlir::Operation* op_; + std::string device_type_; + + mlir::PatternRewriter& rewriter_; + ::xla::MlirHloBuilder hlo_builder_; + tensorflow::OpOrArgLocNameMapper name_mapper_; + + tensorflow::XlaContext* context_; // Ref-counted. + + std::unique_ptr device_mgr_; + tensorflow::Device* device_; // Owned by device_mgr_; + std::unique_ptr step_container_; + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + tensorflow::OpKernelContext::Params params_; + + bool use_tf2xla_hlo_importer_; + xla::XlaBuilder xla_builder_; +}; + +} // namespace mhlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TF2XLA_REWRITER_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc new file mode 100644 index 00000000000..4aeb42bd7bd --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc @@ -0,0 +1,324 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h" + +#include +#include +#include +#include + +#include +#include +#include "absl/memory/memory.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/tsl/lib/core/status_test_util.h" +#include "tensorflow/tsl/platform/errors.h" +#include "tensorflow/tsl/platform/status.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace mlir { +namespace mhlo { + +using ::mlir::LogicalResult; +using ::mlir::ModuleOp; +using ::mlir::OpBuilder; +using ::mlir::Operation; +using ::mlir::func::FuncOp; +using ::tsl::Status; +using ::tsl::StatusOr; 
+using ::xla::ReplicaGroup;
+using ::xla::ShapeUtil;
+using ::xla::XlaBuilder;
+using ::xla::XlaComputation;
+using ::xla::XlaOp;
+
+static constexpr char kMlirModuleStr[] = R"(
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1442 : i32}} {
+  func.func @main(%arg0: tensor<3xi64> {tf._user_specified_name = "resource", tf.aliasing_output = 3 : i64}) -> () attributes {tf.entry_function = {control_outputs = "stateful_normal/RngReadAndSkip,stateful_uniform/RngReadAndSkip,stateful_uniform_full_int/RngReadAndSkip", inputs = "stateful_normal_rngreadandskip_resource", outputs = "identity_RetVal,identity_1_RetVal,identity_2_RetVal"}} {
+    %0:3 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<3xi64>) -> (tensor<i64>, tensor<i64>, tensor<i64>)
+    return
+  }
+})";
+
+XlaComputation GetTestXlaComputation() {
+  XlaBuilder xla_builder("test");
+  XlaOp add = xla::Add(xla::ConstantR0<float>(&xla_builder, 1.0),
+                       xla::ConstantR0<float>(&xla_builder, 2.0));
+
+  std::vector<XlaOp> tuple_values;
+  tuple_values.push_back(add);
+
+  xla::Tuple(&xla_builder, tuple_values);
+  return xla_builder.Build().value();
+}
+
+class EmptyPatternRewriter : public mlir::PatternRewriter {
+ public:
+  explicit EmptyPatternRewriter(const OpBuilder& other_builder)
+      : mlir::PatternRewriter(other_builder) {}
+  ~EmptyPatternRewriter() override = default;
+};
+
+class Tf2XlaRewriterTestPeer {
+ public:
+  explicit Tf2XlaRewriterTestPeer() = delete;
+  explicit Tf2XlaRewriterTestPeer(mlir::Operation* op)
+      : op_builder_(op),
+        empty_rewriter_(op_builder_),
+        tf2xla_rewriter_(op, empty_rewriter_,
+                         /*device_type=*/"XLA_CPU_JIT",
+                         /*use_tf2xla_hlo_importer=*/true) {}
+
+  tsl::StatusOr<TupleOp> ImportXlaComputationIntoModule(
+      XlaComputation& computation) {
+    return tf2xla_rewriter_.ImportXlaComputation(computation);
+  }
+
+ private:
+  OpBuilder op_builder_;
+  EmptyPatternRewriter empty_rewriter_;
+  Tf2XlaRewriter tf2xla_rewriter_;
+};
+
+// This should only have unit tests. End to end tests should be done with
+// FileCheck and MLIR tests.
+class Tf2XlaRewriterTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    tensorflow::XlaOpRegistry::RegisterCompilationKernels();
+  }
+
+  Status CreateMlirModule(std::string module_string = kMlirModuleStr) {
+    TF_ASSIGN_OR_RETURN(
+        module_, test::GetMlirModuleFromString(module_string, &context_));
+
+    context_.loadAllAvailableDialects();
+    return tsl::OkStatus();
+  }
+
+  Status LegalizeSingleOp(bool use_tf2xla_hlo_importer, Operation& op) {
+    SourceMgrDiagnosticHandler sourceMgrHandler(source_manager_, &context_);
+
+    OpBuilder op_builder(&op);
+    EmptyPatternRewriter pattern_rewriter(op_builder);
+
+    LogicalResult result = Tf2XlaRewriter::RewriteOp(
+        &op, pattern_rewriter,
+        /*device_type=*/"XLA_CPU_JIT", use_tf2xla_hlo_importer);
+    if (!result.succeeded()) {
+      return tsl::errors::Internal("Failed to rewrite op");
+    }
+
+    return tsl::OkStatus();
+  }
+
+  Status LegalizeModule(bool use_tf2xla_hlo_importer,
+                        std::string module_string = kMlirModuleStr) {
+    TF_EXPECT_OK(CreateMlirModule(module_string));
+    FuncOp main = module_->lookupSymbol<FuncOp>("main");
+    if (!main) {
+      return tsl::errors::InvalidArgument("Could not find a main function");
+    }
+
+    WalkResult walk_result = main.walk([&](Operation* op) {
+      if (op->getDialect()->getNamespace() !=
+          TF::TensorFlowDialect::getDialectNamespace()) {
+        return WalkResult::advance();
+      }
+
+      if (!LegalizeSingleOp(use_tf2xla_hlo_importer, *op).ok()) {
+        return WalkResult::interrupt();
+      }
+
+      return WalkResult::advance();
+    });
+
+    if (walk_result.wasInterrupted()) {
+      return tsl::errors::Internal("Could not legalize all ops");
+    }
+
+    return tsl::OkStatus();
+  }
+
+  mlir::func::FuncOp GetMainFunc() {
+    func::FuncOp main_func = module_->lookupSymbol<mlir::func::FuncOp>("main");
+    EXPECT_TRUE(main_func);
+
+    return main_func;
+  }
+
+  mlir::Operation& GetFirstOpFromMain() {
+    mlir::func::FuncOp main_func = GetMainFunc();
+    return main_func.getBody().front().front();
+  }
+
+  StatusOr<TupleOp> ImportXlaComputationIntoModule(
+      XlaComputation& computation) {
+    SourceMgrDiagnosticHandler sourceMgrHandler(source_manager_, &context_);
+
+    mlir::Operation& first_op = GetFirstOpFromMain();
+
+    Tf2XlaRewriterTestPeer test_peer(&first_op);
+    return test_peer.ImportXlaComputationIntoModule(computation);
+  }
+
+ protected:
+  MLIRContext context_;
+  OwningOpRef<ModuleOp> module_;
+  llvm::SourceMgr source_manager_;
+};
+
+TEST_F(Tf2XlaRewriterTest, LegalizesOp) {
+  TF_EXPECT_OK(LegalizeModule(/*use_tf2xla_hlo_importer=*/false));
+}
+
+TEST_F(Tf2XlaRewriterTest, LegalizesOpWithTf2xlaHloImporter) {
+  TF_EXPECT_OK(LegalizeModule(/*use_tf2xla_hlo_importer=*/true));
+
+  int num_tuple_ops = 0;
+  module_->walk([&num_tuple_ops](TupleOp tuple_op) { num_tuple_ops += 1; });
+
+  EXPECT_EQ(num_tuple_ops, 0);
+}
+
+TEST_F(Tf2XlaRewriterTest, ImportsXlaComputationIntoModule) {
+  TF_ASSERT_OK(CreateMlirModule());
+
+  XlaComputation computation = GetTestXlaComputation();
+
+  TF_ASSERT_OK_AND_ASSIGN(TupleOp root_tuple,
+                          ImportXlaComputationIntoModule(computation));
+
+  ModuleOp parent_module =
+      root_tuple.getOperation()->getParentOfType<ModuleOp>();
+  EXPECT_EQ(parent_module, *module_);
+}
+
+TEST_F(Tf2XlaRewriterTest, FailsWithoutRootTuple) {
+  TF_ASSERT_OK(CreateMlirModule());
+
+  XlaBuilder xla_builder("test_fail");
+  xla::Add(xla::ConstantR0<float>(&xla_builder, 1.0),
+           xla::ConstantR0<float>(&xla_builder, 2.0));
+  XlaComputation bad_computation = xla_builder.Build().value();
+
+  EXPECT_FALSE(ImportXlaComputationIntoModule(bad_computation).ok());
+}
+
+TEST_F(Tf2XlaRewriterTest, ImportsSingleComputation) {
+  XlaBuilder
builder("test_builder"); + XlaComputation to_apply; + { + auto sub_builder = builder.CreateSubBuilder("add"); + auto arg0 = Parameter(sub_builder.get(), 0, + ShapeUtil::MakeScalarShape(xla::F32), "x"); + auto arg1 = Parameter(sub_builder.get(), 1, + ShapeUtil::MakeScalarShape(xla::F32), "y"); + Add(arg0, arg1); + TF_ASSERT_OK_AND_ASSIGN(to_apply, sub_builder->Build()); + } + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(xla::F32, {4, 16}), "x"); + ReplicaGroup group; + group.add_replica_ids(0); + group.add_replica_ids(1); + XlaOp reduce_scatter = + ReduceScatter(x, to_apply, /*scatter_dimension=*/1, /*shard_count=*/2, + /*replica_groups=*/{group}); + + std::vector tuple_values; + tuple_values.push_back(reduce_scatter); + xla::Tuple(&builder, tuple_values); + + TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build()); + EXPECT_EQ(computation.proto().computations_size(), 2); + + TF_ASSERT_OK(CreateMlirModule()); + TF_ASSERT_OK_AND_ASSIGN(TupleOp root_tuple, + ImportXlaComputationIntoModule(computation)); + EXPECT_TRUE(root_tuple); + + int num_func_ops = 0; + module_->walk([&num_func_ops](func::FuncOp func_op) { num_func_ops++; }); + + // Ensure that only a single computation was imported. + EXPECT_EQ(num_func_ops, 1); +} + +TEST_F(Tf2XlaRewriterTest, InsertsConstantParameters) { + static constexpr char kModuleWithConstParam[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1442 : i32}} { + func.func @main(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tf.Const"() {value = dense<1.42> : tensor<2xf32>} : () -> tensor<2xf32> + %1 = "tf.Atan2"(%arg0, %0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + func.return %0 : tensor<2xf32> + } + })"; + + TF_ASSERT_OK( + LegalizeModule(/*use_tf2xla_hlo_importer=*/true, kModuleWithConstParam)); +} + +TEST_F(Tf2XlaRewriterTest, DISABLED_ImportsPrivateFunctions) { + XlaBuilder builder("test_builder"); + XlaComputation to_apply; + { + auto sub_builder = builder.CreateSubBuilder("add"); + auto arg0 = Parameter(sub_builder.get(), 0, + ShapeUtil::MakeScalarShape(xla::F32), "x"); + auto arg1 = Parameter(sub_builder.get(), 1, + ShapeUtil::MakeScalarShape(xla::F32), "y"); + Add(arg0, arg1); + TF_ASSERT_OK_AND_ASSIGN(to_apply, sub_builder->Build()); + } + auto a = Parameter(&builder, 0, ShapeUtil::MakeScalarShape(xla::F32), "a"); + auto b = Parameter(&builder, 1, ShapeUtil::MakeScalarShape(xla::F32), "b"); + XlaOp call_op = xla::Call(&builder, to_apply, {a, b}); + + std::vector tuple_values; + tuple_values.push_back(call_op); + xla::Tuple(&builder, tuple_values); + + TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build()); + EXPECT_EQ(computation.proto().computations_size(), 2); + + TF_ASSERT_OK(CreateMlirModule()); + TF_ASSERT_OK_AND_ASSIGN(TupleOp root_tuple, + ImportXlaComputationIntoModule(computation)); + EXPECT_TRUE(root_tuple); +} + +} // namespace mhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization_test.cc index 0a3de2f45ac..264d64eda8a 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization_test.cc @@ -25,16 +25,17 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h" #include "tensorflow/core/lib/monitoring/cell_reader.h" +#include "tensorflow/tsl/platform/statusor.h" namespace tensorflow { namespace { -using ::llvm::StringRef; -using ::mlir::DialectRegistry; using ::mlir::MLIRContext; using ::mlir::ModuleOp; using ::mlir::OwningOpRef; +using ::mlir::mhlo::test::GetMlirModuleFromString; using ::tensorflow::monitoring::testing::CellReader; // Using a string constant here instead of testdata to make this compatible @@ -50,21 +51,6 @@ static constexpr char kMlirModuleStr[] = R"( static constexpr char kFailedLegalizationStreamz[] = "/tensorflow/core/tf2xla/mlir_second_phase_failed_legalization_op_count"; -tsl::StatusOr> GetMlirModuleFromString( - StringRef string, MLIRContext* context) { - DialectRegistry mlir_registry; - RegisterAllTensorFlowDialects(mlir_registry); - context->appendDialectRegistry(mlir_registry); - - OwningOpRef mlir_module; - auto status = - tensorflow::DeserializeMlirModule(string, context, &mlir_module); - if (!status.ok()) { - return status; - } - return mlir_module; -} - TEST(VerifyTfxlaLegalizationTest, RecordsStreamzFailedVerification) { MLIRContext context; TF_ASSERT_OK_AND_ASSIGN(OwningOpRef module, diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc index 3f993f270c5..09d5b91f05a 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc @@ -38,6 +38,7 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project @@ -52,12 +53,15 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/tf2xla/transforms/utils.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h" #include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/xla/mlir_hlo/mhlo/transforms/rewriters.h" #include "tensorflow/compiler/xla/translate/hlo_to_mhlo/attribute_importer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/util/quantization/uniform_quant_ops_attr.pb.h" #include "tensorflow/core/util/quantization/uniform_quant_ops_params.h" @@ -68,6 +72,10 @@ namespace { #define GEN_PASS_DEF_LEGALIZETF #include "tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.h.inc" +auto *mlir_failed_legalization_count = tensorflow::monitoring::Counter<2>::New( + "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_pass_count", + "Counts the failure of legalization of ops", "op_name", "legality"); + class LegalizeTF : public impl::LegalizeTFBase { public: explicit LegalizeTF(bool allow_partial_conversion, bool legalize_chlo, @@ -87,17 +95,6 @@ class LegalizeTF : public impl::LegalizeTFBase { #define GEN_PASS_DEF_LEGALIZETFMODULEPASS #include "tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.h.inc" -class LegalizeTFModulePass - : public impl::LegalizeTFModulePassBase { - public: - explicit LegalizeTFModulePass(StringRef tf2xla_fallback_device_type) { - device_type_ = tf2xla_fallback_device_type.str(); - } - - /// Performs the lowering to XLA dialect. - void runOnOperation() override; -}; - FailureOr GetStorageType(Operation *op, Type original_output_element_type, PatternRewriter &rewriter) { @@ -169,13 +166,15 @@ FailureOr GetUniformQuantizedType( return GetSameShapeTensorType(original_type.cast(), elem_ty); } -template -FailureOr CreateConstantOpForQint8Rhs( - UniformQuantizedOp op, TensorType new_rhs_type, PatternRewriter &rewriter) { +template +FailureOr CreateConstantOp(UniformQuantizedOp op, + Value original_operand, + TensorType new_operand_type, + PatternRewriter &rewriter) { // Check whether the rhs operand has constant op. 
TF::TensorProtoAttr tensor_proto_attr; - if (!matchPattern(op.getRhs(), m_Constant(&tensor_proto_attr))) { - return rewriter.notifyMatchFailure(op, "rhs must be constant."); + if (!matchPattern(original_operand, m_Constant(&tensor_proto_attr))) { + return rewriter.notifyMatchFailure(op, "operand must be constant."); } llvm::StringRef mangled_tensor = tensor_proto_attr.getValue(); @@ -186,7 +185,7 @@ FailureOr CreateConstantOpForQint8Rhs( tensorflow::Status status = tensorflow::mangling_util::DemangleTensor(tensor_view, &tensor_proto); if (!status.ok()) { - return rewriter.notifyMatchFailure(op, status.error_message()); + return rewriter.notifyMatchFailure(op, status.message()); } tensorflow::Tensor t; @@ -194,11 +193,13 @@ FailureOr CreateConstantOpForQint8Rhs( return op.emitError("Failed to convert tensor proto to Tensor."); } - auto arr = t.flat(); + auto arr = t.flat(); auto dense_attr = mlir::DenseElementsAttr::get( - GetSameShapeTensorType(new_rhs_type, rewriter.getIntegerType(8)), + GetSameShapeTensorType( + new_operand_type, + rewriter.getIntegerType(8 * sizeof(TFQuantizedType))), llvm::ArrayRef(arr.data(), arr.size())); - return rewriter.create(op.getLoc(), new_rhs_type, + return rewriter.create(op.getLoc(), new_operand_type, dense_attr); } @@ -360,7 +361,8 @@ class ConvertUniformQuantizedDotHybridOp return failure(); } - auto rhs = CreateConstantOpForQint8Rhs(op, *rhs_type, rewriter); + auto rhs = CreateConstantOp(op, op.getRhs(), *rhs_type, + rewriter); if (failed(rhs)) { return failure(); } @@ -388,7 +390,8 @@ class ConvertUniformQuantizedConvolutionHybridOp return failure(); } - auto rhs = CreateConstantOpForQint8Rhs(op, *rhs_type, rewriter); + auto rhs = CreateConstantOp(op, op.getRhs(), *rhs_type, + rewriter); if (failed(rhs)) { return failure(); } @@ -498,7 +501,8 @@ class ConvertUniformQuantizedDotOp return failure(); } - auto rhs_or = CreateConstantOpForQint8Rhs(op, *rhs_type, rewriter); + auto rhs_or = CreateConstantOp(op, op.getRhs(), + *rhs_type, rewriter); if (failed(rhs_or)) { return failure(); } @@ -539,7 +543,8 @@ class ConvertUniformQuantizedConvolutionOp return failure(); } - auto rhs_or = CreateConstantOpForQint8Rhs(op, *rhs_type, rewriter); + auto rhs_or = CreateConstantOp(op, op.getRhs(), + *rhs_type, rewriter); if (failed(rhs_or)) { return failure(); } @@ -565,6 +570,110 @@ class ConvertUniformQuantizedConvolutionOp } }; +class ConvertUniformQuantizedAddOp + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + TF::UniformQuantizedAddOp op, TF::UniformQuantizedAddOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value lhs = adaptor.getLhs(); + + auto lhs_type = lhs.getType().cast(); + if (!lhs_type.hasRank()) { + return rewriter.notifyMatchFailure( + op, "Legalization supports cases where only lhs rank known."); + } + // rhs (bias) is always 1D that broadcasts to the last dim of lhs. 
+ auto broadcast_dims = + GetI64ElementsAttr({lhs_type.getRank() - 1}, &rewriter); + + auto rhs_type = GetUniformQuantizedType( + op, adaptor.getRhs().getType(), op.getRhsScales(), + op.getRhsZeroPoints(), + /*expressed_type=*/rewriter.getF32Type(), op.getRhsQuantizationMinVal(), + op.getRhsQuantizationMaxVal(), op.getRhsQuantizationAxis(), rewriter); + if (failed(rhs_type)) { + return failure(); + } + + auto rhs_or = CreateConstantOp(op, op.getRhs(), + *rhs_type, rewriter); + if (failed(rhs_or)) { + return failure(); + } + + auto output_type = GetUniformQuantizedType( + op, op.getOutput().getType(), op.getOutputScales(), + op.getOutputZeroPoints(), + /*expressed_type=*/rewriter.getF32Type(), + op.getOutputQuantizationMinVal(), op.getOutputQuantizationMaxVal(), + op.getOutputQuantizationAxis(), rewriter); + if (failed(output_type)) { + return failure(); + } + + // lhs, rhs, output scales and zero_points are guaranteed (by the TF + // quantizer) to be identical, respectively. + rewriter.replaceOpWithNewOp(op, *output_type, lhs, + *rhs_or, broadcast_dims); + return success(); + } +}; + +class ConvertUniformQuantizedClipByValueOp + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + TF::UniformQuantizedClipByValueOp op, + TF::UniformQuantizedClipByValueOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value operand = adaptor.getOperand(); + + const int64_t quantization_axis = op.getQuantizationAxis(); + llvm::SmallVector broadcast_dims_values = {}; + if (quantization_axis >= 0) { + broadcast_dims_values.push_back(quantization_axis); + } + auto broadcast_dims = GetI64ElementsAttr(broadcast_dims_values, &rewriter); + + auto min_max_type = GetUniformQuantizedType( + op, adaptor.getMin().getType(), op.getScales(), op.getZeroPoints(), + /*expressed_type=*/rewriter.getF32Type(), op.getQuantizationMinVal(), + op.getQuantizationMaxVal(), op.getQuantizationAxis(), rewriter); + if (failed(min_max_type)) { + return failure(); + } + auto min_or = CreateConstantOp(op, op.getMin(), + *min_max_type, rewriter); + if (failed(min_or)) { + return failure(); + } + auto max_or = CreateConstantOp(op, op.getMax(), + *min_max_type, rewriter); + if (failed(max_or)) { + return failure(); + } + + auto output_type = GetUniformQuantizedType( + op, op.getOutput().getType(), op.getScales(), op.getZeroPoints(), + /*expressed_type=*/rewriter.getF32Type(), op.getQuantizationMinVal(), + op.getQuantizationMaxVal(), op.getQuantizationAxis(), rewriter); + if (failed(output_type)) { + return failure(); + } + + Value res_min_clipped = rewriter.create( + op->getLoc(), *output_type, operand, *min_or, broadcast_dims); + rewriter.replaceOpWithNewOp( + op, *output_type, res_min_clipped, *max_or, broadcast_dims); + return success(); + } +}; + // Emits debug information which includes the number of ops of each type which // failed to legalize. 
void EmitLegalizationErrors(Operation *op, @@ -736,25 +845,64 @@ RewritePatternSet PatternsIncludeOps( return to; } +std::string OperationLegalityString(Operation *op, + const ConversionTarget &target) { + auto op_name = op->getName(); + auto action = target.getOpAction(op_name); + if (!action.has_value()) { + return "Unknown"; + } + switch (action.value_or(ConversionTarget::LegalizationAction::Legal)) { + case ConversionTarget::LegalizationAction::Legal: + return "Legal"; + case ConversionTarget::LegalizationAction::Dynamic: + return "Dynamic"; + case ConversionTarget::LegalizationAction::Illegal: + return "Illegal"; + default: + return "Invalid"; + } +} + +void IncrementFailedLegalizationCount(Operation *op, + const ConversionTarget &target) { + auto op_name = op->getName(); + auto name_string = op_name.getStringRef().str(); + auto op_legality = OperationLegalityString(op, target); + + mlir_failed_legalization_count->GetCell(name_string, op_legality) + ->IncrementBy(1); +} + mlir::LogicalResult ApplyPatterns(Operation *op, RewritePatternSet &patterns, bool legalize_chlo) { ConversionTarget target = GetDefaultLegalConversionTargets(*op->getContext(), legalize_chlo); - return applyPartialConversion(op, target, std::move(patterns)); + DenseSet unconverted_ops; + auto result = + applyPartialConversion(op, target, std::move(patterns), &unconverted_ops); + if (failed(result)) { + IncrementFailedLegalizationCount(op, target); + } + for (const auto &unconverted_op : unconverted_ops) { + IncrementFailedLegalizationCount(unconverted_op, target); + } + return result; } /// When `tf2xla_fallback_device_type` is not `None`, also uses legalization /// patterns from TF2XLA fallback for provided device type (see -/// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is not -/// used. +/// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is +/// not used. LogicalResult legalizeTF(Operation *op, bool legalize_chlo, std::optional tf2xla_fallback_device_type, - bool prefer_tf2xla) { + bool prefer_tf2xla, bool use_tf2xla_hlo_importer) { MLIRContext *context = op->getContext(); RewritePatternSet legalize_lower_patterns(context); // Note that the `OperationConverter` orders patterns lexicographically by: - // 1) Ascending legalization depth (i.e., minimum number of patterns necessary + // 1) Ascending legalization depth (i.e., minimum number of patterns + // necessary // to arrive at conversion target). This requires relevant patterns to // specify the list of ops generated by it which most of patterns // implemented in C++ don't do so this comparison doesn't work in those @@ -791,9 +939,9 @@ LogicalResult legalizeTF(Operation *op, bool legalize_chlo, Tf2XlaTypeConverter converter; if (tf2xla_fallback_device_type) { // Add TF->HLO legalization patterns via TF2XLA fallback. - PopulateLegalizeTfWithTf2XlaPatterns(tf2xla_fallback_device_type.value(), - patterns, context, converter, - prefer_tf2xla); + PopulateLegalizeTfWithTf2XlaPatterns( + tf2xla_fallback_device_type.value(), patterns, context, converter, + prefer_tf2xla, use_tf2xla_hlo_importer); } // Populate with CHLO->HLO lowerings to account for TF ops legalized to @@ -817,28 +965,8 @@ void LegalizeTF::runOnOperation() { tf2xla_fallback_device_type = device_type_; } if (failed(legalizeTF(getOperation(), legalize_chlo_, - tf2xla_fallback_device_type, prefer_tf2xla_))) { - signalPassFailure(); - } -} - -void LegalizeTFModulePass::runOnOperation() { - // This pass should only be run when a fallback device is present. 
- if (!device_type_.hasValue()) { - return; - } - VLOG(1) << "TF to XLA legalization patterns include TF2XLA fallback " - "patterns for Ops that need to create functions."; - Operation *op = getOperation(); - MLIRContext *context = op->getContext(); - RewritePatternSet patterns(context); - Tf2XlaTypeConverter converter; - PopulateLegalizeTfWithTf2XlaPatterns(device_type_, patterns, context, - converter, /*prefer_tf2xla=*/false, - /*is_module_pass=*/true); - - if (failed(ApplyPatterns(op, patterns, - /*legalize_chlo=*/false))) { + tf2xla_fallback_device_type, prefer_tf2xla_, + use_tf2xla_hlo_importer_))) { signalPassFailure(); } } @@ -847,14 +975,16 @@ void LegalizeTFModulePass::runOnOperation() { void PopulateLegalizeTfQuantizationPatterns(MLIRContext *context, RewritePatternSet *patterns) { - patterns->add(context); + patterns + ->add(context); } -std::unique_ptr> createLegalizeTFPass( +std::unique_ptr> createLegalizeTFPass( bool allow_partial_conversion, bool legalize_chlo, std::optional tf2xla_fallback_device_type, bool prefer_tf2xla) { return std::make_unique(allow_partial_conversion, legalize_chlo, @@ -862,10 +992,5 @@ std::unique_ptr> createLegalizeTFPass( prefer_tf2xla); } -std::unique_ptr> createLegalizeTFModulePass( - StringRef tf2xla_fallback_device_type) { - return std::make_unique(tf2xla_fallback_device_type); -} - } // end namespace mhlo } // end namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td index 4d1b9388af2..cfec5714798 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td @@ -17,7 +17,7 @@ limitations under the License. include "mlir/Pass/PassBase.td" -def LegalizeTF : Pass<"xla-legalize-tf", "mlir::func::FuncOp"> { +def LegalizeTF : Pass<"xla-legalize-tf", "ModuleOp"> { let summary = "Legalize from TF dialect's or HLO dialect's control flow."; let description = [{ @@ -44,7 +44,12 @@ def LegalizeTF : Pass<"xla-legalize-tf", "mlir::func::FuncOp"> { Option<"prefer_tf2xla_", "prefer-tf2xla", "bool", /*default=*/"false", "Prioritize tf2xla fallback legalization over MLIR legalization " - "patterns"> + "patterns">, + Option<"use_tf2xla_hlo_importer_", "use-tf2xla-hlo-importer", + "bool", /*default=*/"false", + "Use the experimental HLO to MHLO importer for per-op fallback calls " + " from MLIR bridge to TF2XLA." + "Users should not set this flag and ideally this goes away."> ]; let constructor = "mlir::mhlo::createLegalizeTFPass()"; @@ -56,26 +61,6 @@ def LegalizeTF : Pass<"xla-legalize-tf", "mlir::func::FuncOp"> { "sparse_tensor::SparseTensorDialect"]; } -def LegalizeTFModulePass : Pass<"xla-fallback-legalize-tf-module-pass", "ModuleOp"> { - let summary = "Legalize whitelisted Ops using TF2XLA fallback for ops that " - "must also be able to create new functions."; - - let description = [{ - Legalizes whitelisted Ops from TF dialect to HLO dialect using TF2XLA - fallback for ops that must be allowed to create new functions. - }]; - let options = [ - Option<"device_type_", "device-type", "std::string", - /*default=*/"\"INVALID_DEVICE_TYPE\"", - "The device type used by TF2XLA fallback. 
Required.">, - ]; - - let constructor = "mlir::mhlo::createLegalizeTFModulePass()"; - let dependentDialects = ["arith::ArithDialect, chlo::ChloDialect", - "mhlo::MhloDialect", - "shape::ShapeDialect", "func::FuncDialect", "sparse_tensor::SparseTensorDialect"]; -} - def ConvertMHLOQuantToInt : Pass<"convert-mhlo-quant-to-int", "mlir::func::FuncOp"> { let summary = "Convert from MHLO quantized ops to MHLO primitive ops."; diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc new file mode 100644 index 00000000000..7cc4d39676a --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc @@ -0,0 +1,115 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" +#include "tensorflow/core/lib/monitoring/cell_reader.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace tensorflow { +namespace { + +using ::mlir::MLIRContext; +using ::mlir::ModuleOp; +using ::mlir::OwningOpRef; +using ::mlir::PassManager; +using ::tensorflow::monitoring::testing::CellReader; + +StatusOr> GetMlirModuleFromString( + absl::string_view module_string, MLIRContext* context) { + mlir::DialectRegistry mlir_registry; + RegisterAllTensorFlowDialects(mlir_registry); + context->appendDialectRegistry(mlir_registry); + + OwningOpRef mlir_module; + auto status = + tensorflow::DeserializeMlirModule(module_string, context, &mlir_module); + if (!status.ok()) { + return status; + } + return mlir_module; +} + +bool BuildAndRunPipeline(absl::string_view module_string, + const std::function& passes) { + mlir::registerPassManagerCLOptions(); + MLIRContext context; + + OwningOpRef module = + GetMlirModuleFromString(module_string, &context).value(); + + PassManager pm(&context); + + if (mlir::failed(mlir::applyPassManagerCLOptions(pm))) return false; + passes(&pm); + + return pm.run(module.get()).succeeded(); +} + +std::function legalizeTFPasses() { + return [](PassManager* pm) { + pm->addPass(mlir::mhlo::createLegalizeTFPass( + /* allow_partial_conversion=*/false, /* legalize_chlo=*/true, + llvm::StringRef("gpu/xpu"), /* prefer_tf2xla=*/false)); + }; +} + 
+TEST(XlaLegalizeTest, IllegalOp) {
+  constexpr char kMlirIllegalOpStr[] = R"(
+  module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
+    func.func @main() -> tensor<1xi32> {
+      %0 = "tf.DoesntExist"() : () -> tensor<1xi32>
+      func.return %0 : tensor<1xi32>
+    }
+  })";
+  CellReader<int64_t> legalize_failure_count(
+      "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_pass_count");
+
+  auto status = BuildAndRunPipeline(kMlirIllegalOpStr, legalizeTFPasses());
+
+  EXPECT_TRUE(status);
+  EXPECT_EQ(legalize_failure_count.Read("tf.DoesntExist", "Unknown"), 1);
+}
+
+TEST(XlaLegalizeTest, LegalOp) {
+  // We expect legalization to fail for a legal op with dynamic shapes:
+  static constexpr char kMlirLegalOpStr[] = R"(
+  func.func @infeed_dequeue_tuple_dynamic_error() -> (tensor<3x3xf32>, tensor<4x?xf32>) {
+    %0:2 = "tf.InfeedDequeueTuple"() : () -> (tensor<3x3xf32>, tensor<4x?xf32>)
+    func.return %0#0, %0#1 : tensor<3x3xf32>, tensor<4x?xf32>
+  })";
+  CellReader<int64_t> legalize_failure_count(
+      "/tensorflow/core/tf2xla/v0/mlir_failed_xla_legalize_tf_pass_count");
+
+  auto status = BuildAndRunPipeline(kMlirLegalOpStr, legalizeTFPasses());
+
+  EXPECT_TRUE(status);
+  EXPECT_EQ(legalize_failure_count.Read("tf.InfeedDequeueTuple", "Unknown"), 1);
+}
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfr/BUILD b/tensorflow/compiler/mlir/tfr/BUILD
index 93e904f3d90..f9fff19986e 100644
--- a/tensorflow/compiler/mlir/tfr/BUILD
+++ b/tensorflow/compiler/mlir/tfr/BUILD
@@ -286,6 +286,7 @@ tf_py_test(
     deps = [
         "//tensorflow/compiler/mlir/tfr/resources:composite_ops",
         "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/platform:client_testlib",
     ],
 )
 
@@ -317,6 +318,7 @@ tf_py_test(
     ],
     deps = [
         "//tensorflow/compiler/mlir/tfr/resources:composite_ops",
+        "//tensorflow/python/platform:client_testlib",
     ],
 )
 
@@ -354,13 +356,20 @@ py_library(
         "//tensorflow/compiler/mlir/tfr:tfr_wrapper",
         "//tensorflow/python/autograph/converters:control_flow",
         "//tensorflow/python/autograph/converters:return_statements",
-        "//tensorflow/python/autograph/impl",
-        "//tensorflow/python/autograph/pyct",
-        "//tensorflow/python/autograph/pyct/static_analysis",
+        "//tensorflow/python/autograph/impl:api",
+        "//tensorflow/python/autograph/pyct:anno",
+        "//tensorflow/python/autograph/pyct:cfg",
+        "//tensorflow/python/autograph/pyct:qual_names",
+        "//tensorflow/python/autograph/pyct:transformer",
+        "//tensorflow/python/autograph/pyct:transpiler",
+        "//tensorflow/python/autograph/pyct/static_analysis:activity",
+        "//tensorflow/python/autograph/pyct/static_analysis:reaching_definitions",
+        "//tensorflow/python/autograph/pyct/static_analysis:reaching_fndefs",
+        "//tensorflow/python/autograph/pyct/static_analysis:type_inference",
         "//tensorflow/python/framework",
         "//tensorflow/python/framework:dtypes",
         "//tensorflow/python/framework:op_def_registry",
-        "//tensorflow/python/platform",
+        "//tensorflow/python/platform:tf_logging",
         "//tensorflow/python/util:tf_inspect",
         "@gast_archive//:gast",
     ],
@@ -380,6 +389,7 @@ tf_py_test(
         "//tensorflow/compiler/mlir/tfr/resources:test_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/platform:client_testlib",
     ],
 )
 
@@ -389,6 +399,8 @@ py_library(
     srcs_version = "PY3",
     deps = [
         "//tensorflow:tensorflow_py",
+        "//tensorflow/python/autograph/pyct:transformer",
+        "//tensorflow/python/autograph/pyct:transpiler",
     ],
 )
 
@@ -403,6 +415,7 @@ tf_py_test(
         ":composite",
         ":op_reg_gen",
"//tensorflow/compiler/mlir/python/mlir_wrapper:filecheck_wrapper", + "//tensorflow/python/platform:client_testlib", ], ) @@ -412,6 +425,7 @@ py_library( srcs_version = "PY3", deps = [ "//tensorflow:tensorflow_py", + "//tensorflow/python/platform:client_testlib", ], ) diff --git a/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.cc b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.cc index 206e5ef13f8..50a8686ad4d 100644 --- a/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.cc +++ b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.cc @@ -40,7 +40,8 @@ MlirOptimizationPassState GraphDecomposePass::GetPassState( } Status GraphDecomposePass::Run( - const ConfigProto& config_proto, mlir::ModuleOp module, const Graph& graph, + const std::string& function_name, const ConfigProto& config_proto, + mlir::ModuleOp module, const Graph& graph, const FunctionLibraryDefinition& function_library) { if (GetPassState(/*device_set=*/nullptr, config_proto, graph, function_library) == MlirOptimizationPassState::Disabled) { diff --git a/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.h b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.h index e415f5cbea9..575fd2d178d 100644 --- a/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.h +++ b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.h @@ -15,6 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_GRAPH_DECOMPOSE_PASS_H_ #define TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_GRAPH_DECOMPOSE_PASS_H_ +#include + #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" #include "tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h" @@ -40,8 +42,8 @@ class GraphDecomposePass : public MlirOptimizationPass { // This should be used as a thin mapper around mlir::ModulePass::runOnModule // API integrated with the Tensorflow runtime. - Status Run(const ConfigProto& config_proto, mlir::ModuleOp module, - const Graph& graph, + Status Run(const std::string& function_name, const ConfigProto& config_proto, + mlir::ModuleOp module, const Graph& graph, const FunctionLibraryDefinition& function_library) override; }; diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc index 8c33af424b8..91a306c1fba 100644 --- a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc @@ -34,6 +34,7 @@ limitations under the License. 
#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project @@ -144,7 +145,8 @@ TFRDialect::TFRDialect(MLIRContext *context) Operation *TFRDialect::materializeConstant(OpBuilder &builder, Attribute value, Type type, Location loc) { if (arith::ConstantOp::isBuildableWith(value, type)) - return builder.create(loc, type, value); + return builder.create(loc, type, + value.cast()); if (func::ConstantOp::isBuildableWith(value, type)) return builder.create(loc, type, value.cast()); @@ -923,6 +925,16 @@ ArrayRef TFRFuncOp::getCallableResults() { return getFunctionType().getResults(); } +// CallableOpInterface +::mlir::ArrayAttr TFRFuncOp::getCallableArgAttrs() { + return getArgAttrs().value_or(nullptr); +} + +// CallableOpInterface +::mlir::ArrayAttr TFRFuncOp::getCallableResAttrs() { + return getResAttrs().value_or(nullptr); +} + //===----------------------------------------------------------------------===// // Dialect type definitions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tfr/passes/decompose.cc b/tensorflow/compiler/mlir/tfr/passes/decompose.cc index 2e12356f03f..9a76d68efd9 100644 --- a/tensorflow/compiler/mlir/tfr/passes/decompose.cc +++ b/tensorflow/compiler/mlir/tfr/passes/decompose.cc @@ -79,8 +79,8 @@ namespace TFR { namespace { // Quantize the float value based on given scale and zero point attributes. -Attribute Quantize(float value, Attribute scale_attr, Attribute zp_attr, - OpBuilder builder) { +IntegerAttr Quantize(float value, Attribute scale_attr, Attribute zp_attr, + OpBuilder builder) { double scale = scale_attr.cast().getValueAsDouble(); int64_t zp = zp_attr.cast().getInt(); @@ -223,8 +223,8 @@ LogicalResult DecomposeTFOpsPass::RewriteUnregisteredTFOps() { attr_cst = builder.create(op->getLoc(), output_type, attribute); } else { - attr_cst = - builder.create(op->getLoc(), attribute); + attr_cst = builder.create( + op->getLoc(), cast(attribute)); } new_operands.push_back(attr_cst); } diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index 1d748bd6ae9..068b7cabf22 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -24,8 +24,8 @@ package_group( "//tensorflow/core/runtime_fallback/...", "//tensorflow/core/tfrt/eager/...", "//tensorflow/core/tfrt/experimental/data/...", - "//tensorflow/core/tfrt/saved_model/...", "//tensorflow/core/tfrt/graph_executor/...", + "//tensorflow/core/tfrt/saved_model/...", "//tensorflow/core/tfrt/tfrt_session/...", ] + if_google([ "//learning/brain/experimental/mlir/tflite/tfmrt/...", @@ -112,7 +112,7 @@ cc_library( deps = [ "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_jitrt_passes", "//tensorflow/compiler/xla/mlir/backends/cpu/transforms:passes", "//tensorflow/compiler/xla/mlir/runtime/transforms:compiler", @@ -300,7 +300,7 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core/platform:env", 
"//tensorflow/core/platform:threadpool_interface", - "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_execute_compat", + "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_execute_compat_eager", "//tensorflow/core/runtime_fallback/runtime:kernel_utils", "//tensorflow/core/runtime_fallback/runtime:runtime_fallback_alwayslink", "//tensorflow/core/tfrt/utils:fallback_tensor", @@ -394,12 +394,12 @@ cc_library( "transforms/merge_tf_if_ops.cc", "transforms/optimize.cc", "transforms/optimize_tf_control_flow_side_effect.cc", + "transforms/passes.cc", "transforms/remove_device_attribute.cc", "transforms/remove_tf_if_const_args.cc", "transforms/reorder_assert.cc", "transforms/sink_in_invariant_ops.cc", "transforms/tf_to_tfrt.cc", - "transforms/tpu_passes.h", "transforms/xla_rewrite_pass.cc", ], hdrs = [ @@ -411,44 +411,38 @@ cc_library( ":cost_analysis", ":fallback_converter", ":tensor_array_side_effect_analysis", - ":tf_jitrt_opdefs", - ":tf_jitrt_pipeline", + ":tfrt_jitrt_stub", ":tfrt_pipeline_options", + ":tpu_passes", + ":transform_utils", ":transforms/gpu_passes", ":transforms/set_shape_invariant_in_while_ops", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:bridge_logger", "//tensorflow/compiler/mlir/tensorflow:convert_tensor", - "//tensorflow/compiler/mlir/tensorflow:device_util", - "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", "//tensorflow/compiler/mlir/tensorflow:tensorflow_op_interfaces", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_jitrt_passes", - "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_jitrt_clustering", - "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_opdefs", - "//tensorflow/core:framework", - "//tensorflow/core/platform:tstring", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs", + "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_opdefs", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_gpu_opdefs", + "//tensorflow/core:framework", + "//tensorflow/core/platform:status", + "//tensorflow/core/platform:tstring", + "//tensorflow/tsl/platform:status", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", "@tf_runtime//:basic_kernels_opdefs", "@tf_runtime//:core_runtime_opdefs", - "@tf_runtime//backends/jitrt:jitrt_opdefs", "@tf_runtime//:stream_analysis", "@tf_runtime//:test_kernels_opdefs", - ":transform_utils", - "//tensorflow/tsl/platform:status", - ] + if_google([ - "//learning/brain/tfrt/tpu/compiler/mlir:tf_to_tfrt_tpu", - ]), + ], alwayslink = 1, ) @@ -504,30 +498,28 @@ cc_library( ], deps = [ ":tf_to_tfrt", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:FuncDialect", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:import_model", "//tensorflow/compiler/mlir/tensorflow:convert_type", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", 
"//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:import_model", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:status", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", "@tf_runtime//:bef", "@tf_runtime//:core_runtime", "@tf_runtime//:hostcontext", "@tf_runtime//:mlirtobef", "@tf_runtime//:tensor", - ] + if_google([ - "//learning/brain/tfrt/tpu/compiler/mlir:tf_to_tfrt_tpu", - ]), + ], ) cc_library( @@ -539,6 +531,7 @@ cc_library( "translate/import_model.h", ], visibility = [ + # copybara:uncomment "//learning/brain/experimental/tfrt/mlrt/application/tensorflow/compiler/transforms:__pkg__", # copybara:uncomment "//learning/brain/experimental/tfrt/visualization:__pkg__", "//tensorflow/compiler/mlir/tfrt/tests/saved_model:__pkg__", "//tensorflow/core/tfrt/eager:__pkg__", @@ -549,33 +542,35 @@ cc_library( ":function", ":tf_to_tfrt", ":tfrt_compile_options", - "@com_google_absl//absl/strings", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", + ":tfrt_pipeline_options", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:import_model", - "@llvm-project//mlir:FuncDialect", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:import_model", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/core:framework", "//tensorflow/core/common_runtime:function_body", "//tensorflow/core/common_runtime:function_def_utils", - "//tensorflow/core/tfrt/fallback:fallback_state", "//tensorflow/core/platform:status", + "//tensorflow/core/tfrt/fallback:fallback_state", + "//tensorflow/tsl/platform:errors", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@tf_runtime//:bef", "@tf_runtime//:mlirtobef", - ] + if_google([ - "//learning/brain/tfrt/tpu/compiler/mlir:tf_to_tfrt_tpu", - ]), + ], ) cc_library( name = "tfrt_compile_options", srcs = ["translate/tfrt_compile_options.cc"], hdrs = ["translate/tfrt_compile_options.h"], - deps = ["@com_google_absl//absl/strings"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/strings", + ], ) cc_library( @@ -583,6 +578,7 @@ cc_library( srcs = ["analysis/cost_analysis.cc"], hdrs = ["analysis/cost_analysis.h"], deps = [ + ":constants", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core/platform:status", "//tensorflow/core/tfrt/fallback:cost_recorder", @@ -639,12 +635,10 @@ cc_library( ":__subpackages__", ], deps = [ + "//tensorflow/compiler/mlir/tfrt:tf_to_tfrt", "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_jitrt_passes", "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_jitrt_test_passes", - "//tensorflow/compiler/mlir/tfrt:tf_to_tfrt", - ] + if_google([ - "//learning/brain/tfrt/tpu/compiler/mlir:tf_to_tfrt_tpu", - ]), + ], ) cc_library( @@ -659,8 +653,8 @@ cc_library( ], ) -tf_cc_binary( - name = "tf-tfrt-opt", +cc_library( + name = "tf_tfrt_opt_lib", testonly = True, srcs = ["tf-tfrt-opt.cc"], deps = [ @@ -669,6 +663,7 @@ tf_cc_binary( 
":test_tensor_array_side_effect_analysis", ":tf_jitrt_opdefs", ":tf_to_tfrt", + ":tfrt_jitrt_passes", "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir:passes", "//tensorflow/compiler/mlir/lite:tensorflow_lite", @@ -694,6 +689,12 @@ tf_cc_binary( ], ) +tf_cc_binary( + name = "tf-tfrt-opt", + testonly = True, + deps = [":tf_tfrt_opt_lib"], +) + tf_cc_binary( name = "lhlo-tfrt-opt", srcs = ["lhlo-tfrt-opt.cc"], @@ -778,11 +779,11 @@ tf_cc_binary( ], visibility = [":friends"], deps = [ - "@llvm-project//mlir:TranslateLib", - "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_opdefs", - "//tensorflow/compiler/mlir/tfrt:tf_jitrt_registration", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tfrt:tf_jitrt_registration", "//tensorflow/compiler/mlir/tfrt:tfrt_fallback_registration", + "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_opdefs", + "@llvm-project//mlir:TranslateLib", "@tf_runtime//:init_tfrt_dialects", "@tf_runtime//:mlirtobef_translate", ] + if_google( @@ -839,3 +840,55 @@ cc_library( "@llvm-project//mlir:IR", ], ) + +cc_library( + name = "tpu_passes", + hdrs = ["transforms/tpu_passes.h"], + deps = [ + ":fallback_converter", + ":tfrt_compile_options", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) + +cc_library( + name = "tfrt_jitrt_passes", + srcs = ["transforms/tfrt_jitrt_passes.cc"], + deps = [ + ":fallback_converter", + ":tf_jitrt_opdefs", + ":tf_jitrt_pipeline", + ":tfrt_jitrt_stub", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", + "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs", + "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_jitrt_clustering", + "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_jitrt_passes", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:TransformUtils", + "@tf_runtime//:basic_kernels_opdefs", + "@tf_runtime//backends/jitrt:jitrt_opdefs", + ], + alwayslink = 1, +) + +cc_library( + name = "tfrt_jitrt_stub", + srcs = ["transforms/tfrt_jitrt_stub.cc"], + hdrs = ["transforms/tfrt_jitrt_stub.h"], + deps = [ + ":corert_converter", + ":tfrt_pipeline_options", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:TransformUtils", + ], +) + +cc_library( + name = "constants", + hdrs = ["constants.h"], +) diff --git a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc index 9426580bf13..c7d02332839 100644 --- a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc +++ b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfrt/constants.h" #include "tensorflow/core/tfrt/fallback/cost_recorder.h" namespace tensorflow { @@ -141,6 +142,7 @@ void CostAnalysis::AnalyzeArguments(mlir::func::FuncOp func_op) { // Use the max size among function inputs as the default size of dynamic // shaped tensors in the function. for (auto arg : func_op.getArguments()) { + if (!arg.getType().isa()) continue; auto type = arg.getType().cast(); if (type.hasRank()) { max_arg_size_ = std::max(max_arg_size_, GetRankedTensorSize(type)); @@ -160,15 +162,6 @@ void CostAnalysis::EvaluateCost(mlir::Operation* op) { return; } - // These ops are cheap regardless of their input sizes. 
- // - // TODO(chky): Find a more scalable way to figure out cheap ops. - if (llvm::isa(op)) { - cost_map_[op] = kDefaultCheapCost; - return; - } - // Try to use its cost function if it is registered. const auto& registry = GetCostFunctionRegistry(); absl::string_view op_name = op->getName().getStringRef(); @@ -180,6 +173,25 @@ void CostAnalysis::EvaluateCost(mlir::Operation* op) { return; } + // Try to use the recorded cost if any. + if (cost_recorder_ != nullptr) { + const auto op_key_attr = + op->getAttrOfType(kOpKeyAttrName); + if (op_key_attr) { + cost_map_[op] = cost_recorder_->GetCostNanosecond(op_key_attr.getInt()); + return; + } + } + + // These ops are cheap regardless of their input sizes. + // + // TODO(chky): Find a more scalable way to figure out cheap ops. + if (llvm::isa(op)) { + cost_map_[op] = kDefaultCheapCost; + return; + } + // For other ops, use the sum of input sizes as its cost. int64_t cost = kDefaultCheapCost; for (auto operand : op->getOperands()) { diff --git a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h index 8ed554de919..fa01b38dd64 100644 --- a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h +++ b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" #include "tensorflow/core/tfrt/fallback/op_cost_map.pb.h" namespace tensorflow { @@ -36,7 +37,10 @@ namespace tfrt_compiler { // class CostAnalysis { public: - explicit CostAnalysis(mlir::func::FuncOp func_op) { + explicit CostAnalysis( + mlir::func::FuncOp func_op, + const tfrt_stub::CostRecorder* cost_recorder = nullptr) { + cost_recorder_ = cost_recorder; AnalyzeArguments(func_op); AnalyzeBlock(&func_op.front()); } @@ -50,6 +54,7 @@ class CostAnalysis { int64_t max_arg_size_ = 1; llvm::DenseMap cost_map_; + const tfrt_stub::CostRecorder* cost_recorder_; }; struct CostContext { diff --git a/tensorflow/compiler/mlir/tfrt/constants.h b/tensorflow/compiler/mlir/tfrt/constants.h new file mode 100644 index 00000000000..dfbb9ba4898 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/constants.h @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_CONSTANTS_H_ + +namespace tensorflow { +namespace tfrt_compiler { + +// Use __ prefix to indicate this is internal attribute. 
+inline constexpr char kOpKeyAttrName[] = "__op_key"; + +} // namespace tfrt_compiler + +namespace mlrt_compiler { + +inline constexpr char kArgPassByValue[] = "mlrt.__pass_by_value"; + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_CONSTANTS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.td b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.td index d61d3235e0f..daf76268bc2 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.td +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.td @@ -60,6 +60,26 @@ def SetResourceOp : FallbackSync_Op<"set_resource", [CoreRT_TypedAttributeTrait] let assemblyFormat = "operands attr-dict"; } +def SetResourceDhtOp : FallbackSync_Op<"set_resource_dht", [CoreRT_TypedAttributeTrait]> { + let summary = "Set a DHT in resource array"; + + let description = [{ + Set a DHT in resource array. + + arg: the tensor to be set in the resource array. + index: the index in the resource array + }]; + + let arguments = (ins + TensorType:$arg, + I64Attr:$index + ); + + let results = (outs); + + let assemblyFormat = "operands attr-dict"; +} + def GetResourceOp : FallbackSync_Op<"get_resource", [CoreRT_TypedAttributeTrait]> { let summary = "get a tensor in resource array"; @@ -82,6 +102,28 @@ def GetResourceOp : FallbackSync_Op<"get_resource", let assemblyFormat = "attr-dict `:` type($results)"; } +def GetResourceDhtOp : FallbackSync_Op<"get_resource_dht", + [CoreRT_TypedAttributeTrait]> { + let summary = "get a DHT in resource array"; + + let description = [{ + Get a tensor in resource array. + + indices: the indices in the resource array. + results: the tensor values for the corresponding indices. + }]; + + let arguments = (ins + I64ArrayAttr:$indices + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = "attr-dict `:` type($results)"; +} + def CreateOp: FallbackSync_Op<"createop", []> { let summary = "The Fallback CreateOp"; diff --git a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_kernels.cc b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_kernels.cc index a7cf4379ccb..6fe3091fed3 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_kernels.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_kernels.cc @@ -651,7 +651,7 @@ struct DebugListener : public SpecializationListener { std::string message; llvm::raw_string_ostream os(message); os << "Specialized operands:\n"; - for (auto& tuple : llvm::enumerate(llvm::zip(operands, attrs))) { + for (const auto& tuple : llvm::enumerate(llvm::zip(operands, attrs))) { mlir::Type type = std::get<0>(tuple.value()); mlir::Attribute attr = std::get<1>(tuple.value()); os << "%arg" << tuple.index() << ": " << type << " " << attr << "\n"; diff --git a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_pipeline.cc b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_pipeline.cc index 946cc1c4bb6..327cfa45b9a 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_pipeline.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_pipeline.cc @@ -98,7 +98,7 @@ void CreateTfJitRtPipeline(OpPassManager& pm, pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions()); // Transform TF operation to HLO. - pm.addNestedPass(mlir::mhlo::createLegalizeTFPass()); + pm.addPass(mlir::mhlo::createLegalizeTFPass()); if (options.legalize_i1_tensors) { // Convert 'i1' tensors into 'i8' tensors. @@ -130,7 +130,7 @@ void CreateTfJitRtPipeline(OpPassManager& pm, // Transform HLO operations to Linalg and Standard. 
pm.addNestedPass(mlir::mhlo::createLegalizeControlFlowPass()); pm.addNestedPass(mlir::mhlo::createLegalizeSortPass()); - pm.addNestedPass(xla::cpu::createLegalizeCollectiveOpsPass()); + pm.addNestedPass(xla::cpu::createLegalizeLibraryOpsPass()); if (options.vectorize) { pm.addNestedPass(mlir::mhlo::createLegalizeMHLOToTHLOPass()); @@ -170,6 +170,7 @@ void CreateTfJitRtPipeline(OpPassManager& pm, mlir::gml_st::getDefaultCPUPipelineOptions(llvm::sys::getHostCPUName()); gml_st_opts.matmulTileSizes = options.matmul_tile_sizes; gml_st_opts.lowerToMmt4d = options.lower_to_mmt4d; + gml_st_opts.reductionEnableHeuristic = true; mlir::gml_st::addCPUTilingPipeline(pm, gml_st_opts); } else { pm.addNestedPass(CreateFusionPass()); diff --git a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_test.cc b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_test.cc index 2418798f19d..c0bb497078e 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_test.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_test.cc @@ -29,7 +29,6 @@ namespace tensorflow { using ::tfrt::AsyncValue; -using ::tfrt::DType; using ::tfrt::RCReference; using ::tfrt::RemainingResults; diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/BUILD b/tensorflow/compiler/mlir/tfrt/jit/transforms/BUILD index c185cb0f9c3..b1ce160fa74 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/transforms/BUILD +++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/BUILD @@ -63,7 +63,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/xla/mlir_hlo", "//tensorflow/compiler/xla/mlir_hlo:gml_st", "//tensorflow/compiler/xla/mlir_hlo:gml_st_passes", diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_clustering.cc b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_clustering.cc index 6fe59cf34e7..4ae62f262a5 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_clustering.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_clustering.cc @@ -852,7 +852,7 @@ void populateTfJitRtConstraintsPolicies(ClusteringPolicySet& policies, mlir::LogicalResult IsCompilableConstant(mlir::ElementsAttr value) { return success(value.getNumElements() <= 16 && - value.getType().getElementType().isIntOrIndexOrFloat()); + value.getShapedType().getElementType().isIntOrIndexOrFloat()); } static bool IsI1Integer(Type type) { diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_fusion.cc b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_fusion.cc index 0e3a24ee5c1..65456ca8c0f 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_fusion.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_fusion.cc @@ -39,9 +39,9 @@ using mlir::AffineMap; using mlir::MLIRContext; using mlir::Operation; using mlir::OpOperand; -using mlir::OpResult; using mlir::RewritePatternSet; +namespace affine = mlir::affine; namespace linalg = mlir::linalg; namespace tensor = mlir::tensor; @@ -140,7 +140,7 @@ struct FusionPass : public impl::FusionBase { linalg::populateConstantFoldLinalgOperations(patterns, ControlElementwiseOpsFusion); - mlir::AffineApplyOp::getCanonicalizationPatterns(patterns, context); + affine::AffineApplyOp::getCanonicalizationPatterns(patterns, context); linalg::GenericOp::getCanonicalizationPatterns(patterns, context); 
tensor::ExpandShapeOp::getCanonicalizationPatterns(patterns, context); tensor::CollapseShapeOp::getCanonicalizationPatterns(patterns, context); diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_passes.cc b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_passes.cc index f81159c9699..64237f0bb08 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_passes.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_passes.cc @@ -19,8 +19,6 @@ limitations under the License. namespace tensorflow { -using ::mlir::Operation; - bool IsContiguousMemref(mlir::Value value) { auto memref_type = value.getType().dyn_cast(); if (!memref_type) return false; diff --git a/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/build_defs.bzl b/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/build_defs.bzl index d6327bf24a5..fb183e02362 100644 --- a/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/build_defs.bzl +++ b/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/build_defs.bzl @@ -32,7 +32,10 @@ def _run_regression_test(name, compare_with_tensorflow, vectorize, data): "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tfrt_fallback", "//tensorflow/python:client_testlib", - "//tensorflow/python/platform", + "//tensorflow/python/platform:tf_logging", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/platform:resource_loader", + "//tensorflow/python/platform:gfile", ], ) diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.cc index 62ec862a393..0f16091d799 100644 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.cc +++ b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.cc @@ -33,13 +33,14 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/platform/threadpool.h" #include "tensorflow/core/platform/threadpool_interface.h" -#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.h" +#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat_eager.h" #include "tensorflow/core/runtime_fallback/runtime/kernel_utils.h" #include "tensorflow/core/tfrt/utils/fallback_tensor.h" #include "tfrt/bef/bef_buffer.h" // from @tf_runtime #include "tfrt/bef_converter/mlir_to_bef.h" // from @tf_runtime #include "tfrt/bef_executor/bef_file.h" // from @tf_runtime #include "tfrt/host_context/async_value.h" // from @tf_runtime +#include "tfrt/host_context/chain.h" // from @tf_runtime #include "tfrt/host_context/execution_context.h" // from @tf_runtime #include "tfrt/host_context/function.h" // from @tf_runtime #include "tfrt/host_context/host_context.h" // from @tf_runtime @@ -52,12 +53,9 @@ using ::tfrt::AsyncValue; using ::tfrt::BEFFile; using ::tfrt::ExecutionContext; using ::tfrt::Function; -using ::tfrt::HostContext; using ::tfrt::MakeAvailableAsyncValueRef; using ::tfrt::RCReference; -using ::tfrt::RequestContext; using ::tfrt::RequestContextBuilder; -using ::tfrt::ResourceContext; using ::tensorflow::Env; using ::tensorflow::thread::ThreadPool; @@ -112,8 +110,7 @@ RuntimeFallbackExecutor::RuntimeFallbackExecutor(int64_t num_threads) // Initialize fallback kernels state with a custom intra-op thread pool. 
auto status = tensorflow::tfd::SetUpKernelFallbackCompatRequestContext( &builder, /*runner_table=*/nullptr, eager_context, intra_op_.get()); - CHECK(status.ok()) << "Failed to setup request context: " - << status.error_message(); + CHECK(status.ok()) << "Failed to setup request context: " << status.message(); auto req_ctx = std::move(builder).build(); if (auto err = req_ctx.takeError()) diff --git a/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir index 1cafb216743..ceda5aecef7 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir @@ -244,6 +244,34 @@ func.func private @some_func(%arg: tensor) -> tensor { module attributes {tf_saved_model.semantics} { +// Test not hoisting callees in xla launch functions. + +// CHECK-LABEL: func private @xla_func +func.func private @xla_func(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> + attributes {tf._input_shapes = [#tf_type.shape<1x3>, #tf_type.shape<*>], tf.signature.is_stateful} { + // CHECK-NOT: tf._TfrtGetResource + %0 = "tf.VarHandleOp"() {device = "/device:CPU:0", container = "", shared_name = "variable"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) {device = "/device:CPU:0"} : (tensor>>) -> tensor<1x3xf32> + %2 = "tf.AddV2"(%arg0, %1) {device = "/device:CPU:0"} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %3 = "tf.Identity"(%2) {device = "/device:CPU:0"} : (tensor<1x3xf32>) -> tensor<1x3xf32> + func.return %3 : tensor<1x3xf32> +} + +// CHECK-LABEL: func @main +func.func @main(%arg0: tensor<1x3xf32> {tf_saved_model.index_path = ["input"]}) -> (tensor<*xf32> {tf_saved_model.index_path = ["r"]}) + attributes {tf_saved_model.exported_names = ["main"]} { + %0 = "tf.VarHandleOp"() {device = "/device:CPU:0", container = "", shared_name = "variable"} : () -> tensor>> + %1 = "tf.XlaLaunch"(%arg0, %0) {device = "/device:GPU:0", function = @xla_func, operand_segment_sizes = array} : (tensor<1x3xf32>, tensor>>) -> tensor<*xf32> + func.return %1 : tensor<*xf32> + +} + +} + +// ----- + +module attributes {tf_saved_model.semantics} { + // Test not hoisting in TPU functions. // CHECK-LABEL: func @_tfrt_resource_init @@ -260,4 +288,4 @@ func.func private @func2(%arg: tensor) -> tensor { func.return %r : tensor } -} +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops_mlrt.mlir b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops_mlrt.mlir new file mode 100644 index 00000000000..7b797b357a1 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops_mlrt.mlir @@ -0,0 +1,31 @@ +// RUN: tf-tfrt-opt -split-input-file -tfrt-lower-tf-savedmodel="hoist-invariant-ops=true fuse-get-resource-ops=false" %s | FileCheck %s --dump-input=fail --dump-input-filter=all + +module attributes {tf_saved_model.semantics} { + +// Test hoisting hash table op. 
+ +// CHECK-LABEL: func @_tfrt_resource_init +// CHECK: [[handle:%.*]] = "tf.HashTableV2"() +// CHECK-SAME: shared_name = "x" +// CHECK: "tf._TfrtSetResource"([[handle]]) {device = "/job:localhost/replica:0/task:0/device:CPU:0", index = [[handle_id:.*]] : i64} +// CHECK: [[x:%.*]] = "tf.LookupTableSizeV2"([[handle]]) +// CHECK: "tf._TfrtSetResource"([[x]]) {device = "/job:localhost/replica:0/task:0/device:CPU:0", index = [[size_id:.*]] : i64} : (tensor) -> () + +// CHECK: func @test_hoist_hash_table +func.func @hoist_hash_table(%arg: tensor {tf_saved_model.index_path = ["input"]}, %default: tensor {tf_saved_model.index_path = ["default"]}) -> (tensor {tf_saved_model.index_path = ["r"]}, tensor<*xi64> {tf_saved_model.index_path = ["r1"]}) + attributes {tf_saved_model.exported_names = ["test_hoist_hash_table"]} { + // CHECK-NOT: tf.HashTableV2 + // CHECK-NOT: tf.LookupTableSizeV2 + // CHECK-DAG: [[v0:%.*]] = "tf._TfrtGetResource"() {container = [""], device = "/job:localhost/replica:0/task:0/device:CPU:0", indices = [[[handle_id]]], shared_name = [{{.*}}]} + // CHECK-DAG: [[v1:%.*]] = "tf._TfrtGetResource"() {container = [""], device = "/job:localhost/replica:0/task:0/device:CPU:0", indices = [[[size_id]]], shared_name = [{{.*}}]} + // CHECK-DAG: [[r:%.*]] = "tf.LookupTableFindV2"([[v0]] + // CHECK-DAG: return [[v1]], [[r]] + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "x", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %1 = "tf.LookupTableSizeV2"(%0) {device = ""} : (tensor) -> tensor + %2 = "tf.LookupTableFindV2"(%0, %arg, %default) {device = "/CPU:0"} : (tensor, tensor, tensor) -> tensor<*xi64> + func.return %1, %2 : tensor, tensor<*xi64> +} + +} + +// ----- diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/const_tensor.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/const_tensor.mlir index 7e8655ce4e0..b208fe390ac 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/const_tensor.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/const_tensor.mlir @@ -16,7 +16,7 @@ func.func @dense_tensor() -> tensor<4xui64> { %0 = "tf.Const"() {value = dense<[1, 2, 3, 4]> : tensor<4xui64>} : () -> tensor<4xui64> // CHECK: corert.const_dense_tensor dense<1.000000e+00> : tensor<1xbf16> %1 = "tf.Const"() {device = "/device:CPU:0", value = dense<[1.0]> : tensor<1xbf16>} : () -> tensor<4xbf16> - // CHECK-NOT: corert.executeop + // CHECK: corert.executeop({{.*}}) "tf.Const"() {dtype = ui64, value = dense<[1, 2, 3, 4]> : tensor<4xui64>} : 1 %2 = "tf.Const"() {device = "/device:GPU:0", value = dense<[1, 2, 3, 4]> : tensor<4xui64>} : () -> tensor<4xui64> func.return %0 : tensor<4xui64> } diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir index 149fee8f244..4c5777c28e2 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir @@ -5,7 +5,7 @@ func.func @device_test( %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, %arg1: tensor<1x3xf32> {tf_saved_model.index_path = [0]}) -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) { - // CHECK: device("/device:GPU:0") + // CHECK: {{%.*}} = corert.get_op_handler %arg0 "/device:GPU:0" %2 = "tf.MatMul"(%arg0, %arg1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "/device:GPU:0", transpose_a = false, transpose_b = 
false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> func.return %2 : tensor<3x3xf32> } diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fallback.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fallback.mlir index 0e605ccc6af..8f59a1a42d7 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fallback.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fallback.mlir @@ -62,9 +62,9 @@ func.func @no_native(%arg0: tensor<3x1xf32>, %arg1: tensor, %arg1: tensor>>) -> tensor<3x3xf32> { - // CHECK-NOT: corert.executeop - // CHECK: tfrt_fallback_async.executeop.seq({{.*}}) key({{.*}}) cost({{.*}}) device("/device:GPU:0") "tf.ReadVariableOp" - // CHECK: tfrt_fallback_async.executeop key({{.*}}) cost({{.*}}) device("/device:GPU:0") "tf.MatMul" + // CHECK: {{%.*}} = corert.get_op_handler %arg0 "/device:GPU:0" + // CHECK: {{.*}} = corert.executeop.seq({{.*}}) "tf.ReadVariableOp"({{.*}}) {dtype = f32} : 1 + // CHECK: {{.*}} = corert.executeop({{.*}}) "tf.MatMul"({{.*}}) {T = f32, transpose_a = false, transpose_b = false} : 1 %0 = "tf.ReadVariableOp"(%arg1) {device = "/device:GPU:0", dtype = f32} : (tensor>>) -> tensor<1x3xf32> %1 = "tf.MatMul"(%arg0, %0) {T = f32, device = "/device:GPU:0", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> func.return %1 : tensor<3x3xf32> @@ -117,12 +117,3 @@ func.func @tensor_array() -> (tensor<1x1x512xf32>) { %result = "tf.TensorArrayGatherV3"(%handle, %indices, %flow_1) {device = "/job:localhost/replica:0/task:0/device:CPU:0", element_shape = #tf_type.shape<1x512>} : (tensor<2x!tf_type.resource>>, tensor<1xi32>, tensor) -> tensor<1x1x512xf32> func.return %result : tensor<1x1x512xf32> } - -// CHECK-LABEL: func @gpu_device_cost -func.func @gpu_device_cost(%arg0: tensor<3x1xf32>, %arg1: tensor>>) -> tensor<3x3xf32> { - // CHECK: tfrt_fallback_async.executeop.seq({{.*}}) key({{.*}}) cost({{1}}) device({{.*}}) "tf.ReadVariableOp" - // CHECK: tfrt_fallback_async.executeop key({{.*}}) cost({{1}}) device({{.*}}) "tf.MatMul" - %0 = "tf.ReadVariableOp"(%arg1) {device = "/job:localhost/replica:0/task:0/device:GPU:0", dtype = f32} : (tensor>>) -> tensor<1x3xf32> - %1 = "tf.MatMul"(%arg0, %0) {T = f32, device = "/job:localhost/replica:0/task:0/device:GPU:0", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - func.return %1 : tensor<3x3xf32> -} diff --git a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc index 47c0277670b..0ec42b59fd6 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc @@ -82,80 +82,6 @@ struct HoistInfo { hoisted_values; }; -void ReplaceHoistedValues( - llvm::ArrayRef> - hoisted_values, - mlir::OpBuilder &builder) { - struct HoistedValueInfo { - llvm::SmallVector hoisted_values; - llvm::SmallVector indices; - llvm::SmallVector shared_names; - llvm::SmallVector containers; - }; - // Rearrange the hoisted values by each function and each device. - llvm::DenseMap> - hoisted_values_by_block_device; - - // Find a block where to place tf._TfrtGetResource operation. We do not place - // get resource operations inside the `tf_device.cluster` operations, because - // these blocks are intended for later on-device compilation. Insert resource - // reads to the closest block outside of the `tf_device.cluster` operation. 
- auto hoist_into_block = [](mlir::Value value) -> mlir::Block * { - mlir::Operation *cluster_op = - value.getDefiningOp()->getParentOfType(); - return cluster_op ? cluster_op->getBlock() : value.getParentBlock(); - }; - - for (auto iter : llvm::enumerate(hoisted_values)) { - auto value = iter.value().first; - auto index = iter.index(); - auto &device_map = hoisted_values_by_block_device[hoist_into_block(value)]; - - assert(value.getDefiningOp() && "hoisted values must not be arguments."); - llvm::StringRef device = kCpuDeviceName; - if (auto device_attr = - value.getDefiningOp()->getAttrOfType("device")) { - if (!device_attr.getValue().empty()) device = device_attr.getValue(); - } - - auto &item = device_map[device]; - - item.hoisted_values.push_back(value); - item.indices.push_back(index); - item.shared_names.push_back(iter.value().second.name); - item.containers.push_back(iter.value().second.container); - } - - // Create tf._TfrtGetResource op for each function and device. - for (const auto &block_iter : hoisted_values_by_block_device) { - auto *block = block_iter.first; - const auto &device_map = block_iter.second; - - builder.setInsertionPointToStart(block); - for (const auto &device_iter : device_map) { - llvm::StringRef device = device_iter.getKey(); - mlir::ValueRange old_values = device_iter.getValue().hoisted_values; - const auto &indices = device_iter.getValue().indices; - const auto &shared_name_arr = device_iter.getValue().shared_names; - const auto &container_arr = device_iter.getValue().containers; - - auto get_resource_op = builder.create( - block->getParentOp()->getLoc(), old_values.getTypes(), - builder.getI64ArrayAttr(indices), - builder.getStrArrayAttr(shared_name_arr), - builder.getStrArrayAttr(container_arr)); - get_resource_op->setAttr("device", builder.getStringAttr(device)); - - auto new_values = get_resource_op.getResults(); - for (auto iter : llvm::zip(old_values, new_values)) { - auto old_value = std::get<0>(iter); - auto new_value = std::get<1>(iter); - old_value.replaceAllUsesWith(new_value); - } - } - } -} - bool OnlyHasReadOrNoEffect(mlir::Operation *op) { auto interface = llvm::dyn_cast(op); if (!interface) return false; @@ -275,136 +201,35 @@ void HoistInvariantOpsInFunction( } } +void FindCalleesRecursiveForOp(const mlir::SymbolTable &symbol_table, + mlir::Operation *op, + llvm::StringSet<> &callees) { + for (const auto &named_attr : op->getAttrs()) { + if (auto symbol_attr = + named_attr.getValue().dyn_cast()) { + auto symbol = symbol_attr.getValue(); + if (!callees.contains(symbol)) { + callees.insert(symbol); + + auto func = symbol_table.lookup(symbol); + if (!func) continue; + + func.walk([&](mlir::Operation *op) { + FindCalleesRecursiveForOp(symbol_table, op, callees); + }); + } + } + } +} + void FindCalleesRecursive(const mlir::SymbolTable &symbol_table, mlir::func::FuncOp func, llvm::StringSet<> &callees) { assert(func); func.walk([&](mlir::Operation *op) { - for (const auto &named_attr : op->getAttrs()) { - if (auto symbol_attr = - named_attr.getValue().dyn_cast()) { - auto symbol = symbol_attr.getValue(); - if (!callees.contains(symbol)) { - callees.insert(symbol); - - auto func = symbol_table.lookup(symbol); - if (!func) continue; - - FindCalleesRecursive(symbol_table, func, callees); - } - } - } + FindCalleesRecursiveForOp(symbol_table, op, callees); }); } -void HoistInvariantOps(mlir::ModuleOp module) { - mlir::SymbolTable symbol_table(module); - - // Find all resources used in non-init functions. 
- llvm::DenseMap> - resources; - - // Find all callees referenced in the initialization functions. - llvm::StringSet<> init_callees; - - module.walk([&](mlir::Operation *op) { - if (llvm::isa(op)) { - auto func = op->getParentOfType(); - if (IsSessionInitializer(func)) return; - resources[GetResourceHandle(op)].push_back(op); - } else if (auto func = llvm::dyn_cast(op)) { - if (!IsSessionInitializer(func)) return; - FindCalleesRecursive(symbol_table, func, init_callees); - } - }); - - llvm::DenseSet read_only_vars; - for (const auto &iter : resources) { - const auto &key = iter.first; - const auto &vars = iter.second; - if (std::all_of(vars.begin(), vars.end(), [](mlir::Operation *op) { - for (auto *user : op->getUsers()) { - if (!OnlyHasReadOrNoEffect(user)) return false; - } - return true; - })) { - read_only_vars.insert(key); - } - } - - mlir::TF::SideEffectAnalysis side_effect_analysis(module); - - mlir::OpBuilder builder(&module.getBodyRegion()); - // "_tfrt_resource_init" is the special function that executes all invariant - // ops (eg. read-only variables) used in the model. This function should be - // executed after user-specified initialization. - auto init_func_op = builder.create( - module.getLoc(), "_tfrt_resource_init", - mlir::FunctionType::get(module.getContext(), /*inputs=*/{}, - /*results=*/{})); - auto *block = init_func_op.addEntryBlock(); - builder.setInsertionPointToStart(block); - - HoistInfo module_hoist_info; - - for (auto func : module.getOps()) { - // Skips hoisting if this function is an init function or any callees, - // including recursive ones, of an init functions, because otherwise the - // hoisted values won't be initialized when this function is called. - if (IsSessionInitializer(func) || - init_callees.contains(func.getSymName()) || func == init_func_op) - continue; - - // Skips hoisting if this function runs on TPU. This is will happen when - // fallback to TPUPartitionedCallOp is enabled for SPMD. - // TODO(b/214039254): remove this once tfrt support native SPMD. - bool has_tpu_op = false; - func.walk([&has_tpu_op](mlir::Operation *op) { - if (op->hasAttr("_tpu_replicate")) has_tpu_op = true; - }); - if (has_tpu_op) continue; - - HoistInvariantOpsInFunction(func, read_only_vars, - side_effect_analysis.GetAnalysisForFunc(func), - builder, module_hoist_info); - } - - // Create tf._TfrtSetResource ops in the init function. - for (auto iter : llvm::enumerate(module_hoist_info.hoisted_values)) { - mlir::Value value = iter.value().first; - int64_t index = iter.index(); - - auto new_value = module_hoist_info.value_mapping.lookup(value); - auto *new_op = new_value.getDefiningOp(); - assert(new_op); - builder.setInsertionPointAfter(new_op); - auto set_resource_op = builder.create( - new_op->getLoc(), new_value, index); - - // Preserve the device attribute. - llvm::StringRef device = kCpuDeviceName; - if (auto device_attr = new_op->getAttrOfType("device")) { - if (!device_attr.getValue().empty()) device = device_attr.getValue(); - } - set_resource_op->setAttr("device", builder.getStringAttr(device)); - } - - builder.setInsertionPointToEnd(block); - // Finish building the init function by inserting an return op. - builder.create(init_func_op.getLoc()); - - // Now that we have the index for each value that will be replaced, we can - // create the tf._TfrtGetResource op in each function using these indices. - ReplaceHoistedValues(module_hoist_info.hoisted_values, builder); - - // Lastly, erase the hoisted ops in reverse topological order. 
- for (auto *op : - llvm::reverse(module_hoist_info.hoists_in_topological_order)) { - assert(op->use_empty()); - op->erase(); - } -} - // This pass rewrites tf_saved_model dialect's ops according to TFRT's // requirements: // @@ -416,11 +241,17 @@ void HoistInvariantOps(mlir::ModuleOp module) { class LowerTFSavedModelPass : public mlir::PassWrapper> { + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerTFSavedModelPass) - explicit LowerTFSavedModelPass(bool hoist_invariant_ops) { + explicit LowerTFSavedModelPass(bool hoist_invariant_ops, + bool fuse_get_resource_ops) { hoist_invariant_ops_ = hoist_invariant_ops; + fuse_get_resource_ops_ = fuse_get_resource_ops; } LowerTFSavedModelPass() = default; LowerTFSavedModelPass(const LowerTFSavedModelPass &) {} @@ -512,11 +343,230 @@ class LowerTFSavedModelPass } private: + void HoistInvariantOps(mlir::ModuleOp module); + void ReplaceHoistedValues( + llvm::ArrayRef> + hoisted_values, + mlir::OpBuilder &builder); + Option hoist_invariant_ops_{*this, "hoist-invariant-ops", llvm::cl::desc("hoist-invariant-ops"), llvm::cl::init(false)}; + Option fuse_get_resource_ops_{*this, "fuse-get-resource-ops", + llvm::cl::desc("fuse get resource ops"), + llvm::cl::init(true)}; }; +void LowerTFSavedModelPass::HoistInvariantOps(mlir::ModuleOp module) { + mlir::SymbolTable symbol_table(module); + + // Find all resources used in non-init functions. + llvm::DenseMap> + resources; + + // Find all callees referenced in the initialization functions. + llvm::StringSet<> init_callees; + + // Recursively find all callees referenced in the tf.XlaLaunch op. + // At and after the point of calling this pass, the MLIR xla function is no + // longer used. So there is no point to do hoisting for xla functions. + llvm::StringSet<> xla_launch_callees; + + module.walk([&](mlir::Operation *op) { + if (llvm::isa(op)) { + auto func = op->getParentOfType(); + if (IsSessionInitializer(func)) return; + resources[GetResourceHandle(op)].push_back(op); + } else if (auto func = llvm::dyn_cast(op)) { + if (!IsSessionInitializer(func)) return; + FindCalleesRecursive(symbol_table, func, init_callees); + } else if (op->getName().getStringRef().str() == "tf.XlaLaunch") { + // TODO(b/275095412): Clean up MLIR XLA functions after they are written + // back to function library, so that we don't need to do special handling + // for those functions here. + FindCalleesRecursiveForOp(symbol_table, op, xla_launch_callees); + } + }); + + llvm::DenseSet read_only_vars; + for (const auto &iter : resources) { + const auto &key = iter.first; + const auto &vars = iter.second; + if (std::all_of(vars.begin(), vars.end(), [](mlir::Operation *op) { + for (auto *user : op->getUsers()) { + if (!OnlyHasReadOrNoEffect(user)) return false; + } + return true; + })) { + read_only_vars.insert(key); + } + } + + mlir::TF::SideEffectAnalysis side_effect_analysis(module); + + mlir::OpBuilder builder(&module.getBodyRegion()); + // "_tfrt_resource_init" is the special function that executes all invariant + // ops (eg. read-only variables) used in the model. This function should be + // executed after user-specified initialization. 
+  auto init_func_op = builder.create(
+      module.getLoc(), "_tfrt_resource_init",
+      mlir::FunctionType::get(module.getContext(), /*inputs=*/{},
+                              /*results=*/{}));
+  auto *block = init_func_op.addEntryBlock();
+  builder.setInsertionPointToStart(block);
+
+  HoistInfo module_hoist_info;
+
+  for (auto func : module.getOps()) {
+    // Skips hoisting if this function is an init function or any callee,
+    // including recursive ones, of an init function, because otherwise the
+    // hoisted values won't be initialized when this function is called.
+    if (IsSessionInitializer(func) ||
+        init_callees.contains(func.getSymName()) || func == init_func_op ||
+        xla_launch_callees.contains(func.getSymName()))
+      continue;
+
+    // Skips hoisting if this function runs on TPU. This will happen when
+    // fallback to TPUPartitionedCallOp is enabled for SPMD.
+    // TODO(b/214039254): remove this once tfrt supports native SPMD.
+    bool has_tpu_op = false;
+    func.walk([&has_tpu_op](mlir::Operation *op) {
+      if (op->hasAttr("_tpu_replicate")) has_tpu_op = true;
+    });
+    if (has_tpu_op) continue;
+
+    HoistInvariantOpsInFunction(func, read_only_vars,
+                                side_effect_analysis.GetAnalysisForFunc(func),
+                                builder, module_hoist_info);
+  }
+
+  // Create tf._TfrtSetResource ops in the init function.
+  for (auto iter : llvm::enumerate(module_hoist_info.hoisted_values)) {
+    mlir::Value value = iter.value().first;
+    int64_t index = iter.index();
+
+    auto new_value = module_hoist_info.value_mapping.lookup(value);
+    auto *new_op = new_value.getDefiningOp();
+    assert(new_op);
+    builder.setInsertionPointAfter(new_op);
+    auto set_resource_op = builder.create(
+        new_op->getLoc(), new_value, index);
+
+    // Preserve the device attribute.
+    llvm::StringRef device = kCpuDeviceName;
+    if (auto device_attr = new_op->getAttrOfType("device")) {
+      if (!device_attr.getValue().empty()) device = device_attr.getValue();
+    }
+    set_resource_op->setAttr("device", builder.getStringAttr(device));
+  }
+
+  builder.setInsertionPointToEnd(block);
+  // Finish building the init function by inserting a return op.
+  builder.create(init_func_op.getLoc());
+
+  // Now that we have the index for each value that will be replaced, we can
+  // create the tf._TfrtGetResource op in each function using these indices.
+  ReplaceHoistedValues(module_hoist_info.hoisted_values, builder);
+
+  // Lastly, erase the hoisted ops in reverse topological order.
+  for (auto *op :
+       llvm::reverse(module_hoist_info.hoists_in_topological_order)) {
+    assert(op->use_empty());
+    op->erase();
+  }
+}
+
+void LowerTFSavedModelPass::ReplaceHoistedValues(
+    llvm::ArrayRef>
+        hoisted_values,
+    mlir::OpBuilder &builder) {
+  struct HoistedValueInfo {
+    llvm::SmallVector hoisted_values;
+    llvm::SmallVector indices;
+    llvm::SmallVector shared_names;
+    llvm::SmallVector containers;
+  };
+  // Rearrange the hoisted values by each function and each device.
+  llvm::DenseMap>
+      hoisted_values_by_block_device;
+
+  // Find a block where to place tf._TfrtGetResource operation. We do not place
+  // get resource operations inside the `tf_device.cluster` operations, because
+  // these blocks are intended for later on-device compilation. Insert resource
+  // reads to the closest block outside of the `tf_device.cluster` operation.
+  auto hoist_into_block = [](mlir::Value value) -> mlir::Block * {
+    mlir::Operation *cluster_op =
+        value.getDefiningOp()->getParentOfType();
+    return cluster_op ?
cluster_op->getBlock() : value.getParentBlock(); + }; + + for (auto iter : llvm::enumerate(hoisted_values)) { + auto value = iter.value().first; + auto index = iter.index(); + auto &device_map = hoisted_values_by_block_device[hoist_into_block(value)]; + + assert(value.getDefiningOp() && "hoisted values must not be arguments."); + llvm::StringRef device = kCpuDeviceName; + if (auto device_attr = + value.getDefiningOp()->getAttrOfType("device")) { + if (!device_attr.getValue().empty()) device = device_attr.getValue(); + } + + auto &item = device_map[device]; + + item.hoisted_values.push_back(value); + item.indices.push_back(index); + item.shared_names.push_back(iter.value().second.name); + item.containers.push_back(iter.value().second.container); + } + + // Create tf._TfrtGetResource op for each function and device. + for (const auto &block_iter : hoisted_values_by_block_device) { + auto *block = block_iter.first; + const auto &device_map = block_iter.second; + + builder.setInsertionPointToStart(block); + for (const auto &device_iter : device_map) { + llvm::StringRef device = device_iter.getKey(); + mlir::ValueRange old_values = device_iter.getValue().hoisted_values; + const auto &indices = device_iter.getValue().indices; + const auto &shared_name_arr = device_iter.getValue().shared_names; + const auto &container_arr = device_iter.getValue().containers; + + llvm::SmallVector new_values; + + if (fuse_get_resource_ops_) { + auto get_resource_op = builder.create( + block->getParentOp()->getLoc(), old_values.getTypes(), + builder.getI64ArrayAttr(indices), + builder.getStrArrayAttr(shared_name_arr), + builder.getStrArrayAttr(container_arr)); + get_resource_op->setAttr("device", builder.getStringAttr(device)); + new_values = get_resource_op.getResults(); + } else { + for (int i = 0; i < old_values.size(); ++i) { + auto get_resource_op = builder.create( + block->getParentOp()->getLoc(), + mlir::TypeRange(old_values[i].getType()), + builder.getI64ArrayAttr(indices[i]), + builder.getStrArrayAttr(shared_name_arr[i]), + builder.getStrArrayAttr(container_arr[i])); + get_resource_op->setAttr("device", builder.getStringAttr(device)); + new_values.append(get_resource_op->result_begin(), + get_resource_op->result_end()); + } + } + + for (auto iter : llvm::zip(old_values, new_values)) { + auto old_value = std::get<0>(iter); + auto new_value = std::get<1>(iter); + old_value.replaceAllUsesWith(new_value); + } + } + } +} + static llvm::SmallVector CompareTypes(mlir::TypeRange x, mlir::TypeRange y) { llvm::SmallVector results; @@ -672,8 +722,10 @@ void ConvertReferenceVariableToResourceVariablePass::runOnOperation() { } // namespace std::unique_ptr> -CreateLowerTFSavedModelPass(bool hoist_invariant_ops) { - return std::make_unique(hoist_invariant_ops); +CreateLowerTFSavedModelPass(bool hoist_invariant_ops, + bool fuse_get_resource_ops) { + return std::make_unique(hoist_invariant_ops, + fuse_get_resource_ops); } std::unique_ptr> diff --git a/tensorflow/compiler/mlir/tfrt/transforms/passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/passes.cc new file mode 100644 index 00000000000..2eda5bfd0e9 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/passes.cc @@ -0,0 +1,238 @@ + +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/set_shape_invariant_in_while_ops.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { +namespace { + +// Assigns devices so that later passes can utilize device information. +// Device assignment might have not been done by the upstream pipeline, or get +// removed by previous passes. However, we assume most of the device assignment +// has been done by the upstream pipeline, so we simply assign the default +// device to unassigned ops. Specifically, we do assignment for ConstOp first to +// place it on the same device as its user operation, instead of placing it on +// the default device blindly. +// TODO(b/221297389): Figure out a more robust way to handle dropped device +// assignment. +void AddTfDeviceAssignmentPasses(mlir::OpPassManager &pm, + const TfrtPipelineOptions &options) { + pm.addPass(mlir::TF::CreateConstantOpDeviceAssignmentPass()); + pm.addNestedPass( + mlir::TF::CreateTFDeviceAssignmentByFuncAttrPass()); + pm.addNestedPass( + mlir::TF::CreateSimpleTFDeviceAssignmentPass(options.default_device)); +} + +} // namespace + +void CreateTFExecutorToTFPreInvariantOptimizationPipelineHelper( + mlir::OpPassManager &pm, const TfrtPipelineOptions &options) { + // Due to b/191304670, functionalized while ops might not have the + // shape_invariant attribute set correctly, which leads to failure in shape + // inference. As a workaround, we conservatively (e.g., we place less + // restrictions on tf.while which will avoid failures but lead to potentially + // less exact shape inference) set the shape_invariant attribute in all + // tf.While ops before performing shape inference. + // + // Note that this pass might not work well with TF XLA bridge, but this is + // fine as TF XLA bridge is run before this pipeline. For CPU ops, less exact + // shape inference may lead to fewer optimizations but it should be fine as it + // is limited to while ops currently. + // + // TODO(b/191304670): Remove this pass once the shape_invariant attribute is + // set correctly in the upstream. + pm.addNestedPass( + tfrt_compiler::CreateSetShapeInvariantInWhileOps()); + + // We pass the MLIR module through the TF standard pipeline, which for + // instances does shape inference, canonicalization, inlining, etc. 
+  pm.addNestedPass(
+      mlir::tf_executor::CreateTFExecutorGraphPruningPass());
+  pm.addNestedPass(
+      mlir::tf_executor::CreateTFExecutorIslandCoarseningPass());
+
+  AddTfDeviceAssignmentPasses(pm, options);
+
+  pm.addPass(tfrt_compiler::CreateTfrtXlaRewritePass());
+
+  // Here we perform TFRT-specific optimization before standard TF
+  // optimization, as TFRT-specific optimization may create more opportunities.
+  pm.addNestedPass(
+      tfrt_compiler::CreateOptimizeTfForTfrtPass());
+  pm.addNestedPass(mlir::createCanonicalizerPass());
+  // Guarantee all functions have one use, which enables more exact shape
+  // inference.
+  pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass());
+  pm.addPass(mlir::TF::CreateTFShapeInferencePass());
+  pm.addPass(mlir::createInlinerPass());
+  pm.addPass(mlir::createSymbolDCEPass());
+  pm.addNestedPass(mlir::TF::CreateTFOptimizePass());
+  pm.addNestedPass(mlir::createCSEPass());
+
+  AddTfDeviceAssignmentPasses(pm, options);
+
+  // After the standard passes, we now have MLIR in the TF dialect, and we now
+  // convert reference variables to resource variables, which is best-effort.
+  pm.addPass(CreateConvertReferenceVariableToResourceVariablePass());
+
+  // Move the tf.Assert op to the end of the function, so that it does not
+  // impose unnecessary control dependencies on other ops.
+  pm.addPass(tfrt_compiler::CreateReorderTfAssertPass());
+
+  // Optimize the side effects of control flow ops by examining the ops in
+  // their callees.
+  pm.addPass(tfrt_compiler::CreateOptimizeTfControlFlowSideEffectPass());
+
+  // Remove tf.If ops' operands that are produced by tf.Const ops.
+  pm.addPass(tfrt_compiler::CreateRemoveTfIfConstArgsPass());
+
+  // Merge non-side-effecting tf.If ops if their operands are the same.
+  pm.addPass(tfrt_compiler::CreateMergeTfIfOpsPass());
+
+  // Deduplicate functions invoked by tf.BatchFunction with the same
+  // shared_name.
+  pm.addPass(
+      tfrt_compiler::CreateDeduplicateFunctionsInovkedByBatchFunctionPass());
+
+  // RemoveUnusedWhileResultsPass operates on the region-based control flow, so
+  // the functional control flow is first converted to region-based control
+  // flow, which is converted back after the optimization passes are performed.
+  pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions());
+  pm.addPass(mlir::createInlinerPass());
+  pm.addNestedPass(
+      mlir::TF::CreateRemoveUnusedWhileResultsPass());
+  pm.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional());
+
+  // Apply standard optimization after optimizing control flow ops.
+  pm.addPass(mlir::createInlinerPass());
+  pm.addNestedPass(mlir::createCSEPass());
+
+  // TODO(b/187876545): An extra shape inference pass is added because it does
+  // not work well with the tf.Identity op that removes the ref type. So we
+  // work around this by performing shape inference again after the reference
+  // variable to resource variable conversion. We should remove this after
+  // b/187876545 is fixed.
+  pm.addPass(mlir::TF::CreateTFShapeInferencePass());
+
+  pm.addNestedPass(
+      mlir::TFDevice::CreateLaunchToDeviceAttributePass());
+
+  // After all standard passes, run layout optimization to assign the optimal
+  // data format for all layout-sensitive operations.
+  mlir::TF::LayoutOptimizationPipelineOptions layout_optimization_options;
+  layout_optimization_options.force_data_format =
+      options.force_data_format.getValue();
+  // TODO(b/191304261): Folding transpose in ops is buggy in the layout
+  // optimization pass. Disable it to avoid errors in b/191304261.
This should + // not affect CPU performance as it does not change the number of ops, nor + // does it change the types of the ops. + layout_optimization_options.skip_fold_transpose_in_ops = true; + mlir::TF::CreateLayoutOptimizationPipeline(pm.nest(), + layout_optimization_options); + + // Run canonicalization pipeline to remove unused constants and bypassed + // transpose operations left in the IR after layout optimization. + pm.addNestedPass(mlir::createCanonicalizerPass()); + + // Decompose resource ops as resource variables will be converted to tensors + // directly. + if (options.decompose_resource_ops) + pm.addNestedPass( + mlir::TFDevice::CreateDecomposeResourceOpsPass()); + + AddTfDeviceAssignmentPasses(pm, options); + + pm.addNestedPass( + mlir::TF::CreateTensorDeviceCopyConversionPass()); + + AddTfrtJitRtPasses(options, pm); + + // Rewriter operation sequences to device specific fusions. + DeviceNameUtils::ParsedName parsed_name; + + // Ignore error. + bool success = + DeviceNameUtils::ParseFullName(options.default_device, &parsed_name); + assert(success && "default device is invalid"); + (void)success; + + if (parsed_name.has_type && parsed_name.type == DEVICE_GPU) + pm.addNestedPass(mlir::TF::CreateGpuOpFusionPass()); + + if (parsed_name.has_type && parsed_name.type == DEVICE_CPU) + pm.addNestedPass( + mlir::TF::CreateFusedKernelMatcherPass()); + + if (options.tpu_fuse_ops) { + pm.addNestedPass( + tfrt_compiler::CreateFuseTpuCompileAndExecutePass()); + // Remove ops for the input to _TPUCompileMlirOp, which are no longer needed + // after CreateFuseTpuCompileAndExecutePass + pm.addNestedPass(mlir::createCanonicalizerPass()); + } + + AddTfDeviceAssignmentPasses(pm, options); +} + +void CreateTFExecutorToTFInvariantOptimizationPipelineHelper( + mlir::OpPassManager &pm, const TfrtPipelineOptions &options) { + if (options.sink_in_invariant_ops) { + pm.addPass(CreateSinkInInvariantOpsPass()); + } + + pm.addPass(CreateLowerTFSavedModelPass( + options.hoist_invariant_ops, options.fuse_get_resource_ops_in_hoisting)); +} + +Status ValidateTfrtPipelineOptions(const TfrtPipelineOptions &options) { + if (options.target_tpurt && + (options.target_gpu || options.use_bridge_for_gpu)) { + return tensorflow::errors::Internal( + "Invalid pipeline options. Targeting both TPU and GPU is not " + "supported."); + } + return OkStatus(); +} + +Status CreateTFExecutorToTFPreInvariantOptimizationPipeline( + mlir::PassManager &pm, const TfrtPipelineOptions &options) { + TF_RETURN_IF_ERROR(ValidateTfrtPipelineOptions(options)); + if (VLOG_IS_ON(1)) { + // Print the whole module after each pass, which requires disabling + // multi-threading as well. + pm.getContext()->disableMultithreading(); + pm.enableIRPrinting(std::make_unique( + /*print_module_scope=*/true)); + } + CreateTFExecutorToTFPreInvariantOptimizationPipelineHelper(pm, options); + return OkStatus(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/passes.h b/tensorflow/compiler/mlir/tfrt/transforms/passes.h index 01f1010788c..2502623adfa 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/passes.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/passes.h @@ -25,7 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" #include "tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h" -#include "tensorflow/tsl/platform/status.h" +#include "tensorflow/core/platform/status.h" namespace mlir { class PassManager; @@ -88,7 +88,8 @@ CreateSinkInInvariantOpsPass(); // Create a pass that rewrites tf_saved_model dialect's ops according to TFRT's // requirements. std::unique_ptr> -CreateLowerTFSavedModelPass(bool hoist_invariant_ops); +CreateLowerTFSavedModelPass(bool hoist_invariant_ops, + bool fuse_get_resource_ops); // Create a pass that converts ref variables to resource variables in a limited // number of cases. @@ -116,19 +117,28 @@ CreateCrossDeviceTransferPass(); std::unique_ptr> CreateTfToTfrtConversionPass(const TfrtPipelineOptions& options); -// Creates a pipeline of passes that lowers MLIR TF Executor dialect to TF -// dialect for CoreRT purposes. -tsl::Status CreateTFExecutorToTFPipeline(mlir::PassManager& pm, - const TfrtPipelineOptions& options); - // Creates a pipeline of passes that lowers MLIR TF dialect to TFRT dialects. void CreateTfToTfrtPipeline(mlir::OpPassManager& pm, const TfrtPipelineOptions& options); // Creates a pipeline of passes that lowers MLIR TF dialect from tf.function to // TFRT dialect. SavedModel related conversions are not included. -tsl::Status CreateTfExecutorToTfrtPipeline(mlir::PassManager& pm, - const TfrtPipelineOptions& options); +Status CreateTfExecutorToTfrtPipeline(mlir::PassManager& pm, + const TfrtPipelineOptions& options); + +// Creates a pipeline of passes that lowers MLIR TF Executor dialect to TF +// dialect for CoreRT purposes. +Status CreateTFExecutorToTFPipeline(mlir::PassManager& pm, + const TfrtPipelineOptions& options); + +// TODO(deqiangc): refactor below helpers once mlrt is OSSed. +void CreateTFExecutorToTFPreInvariantOptimizationPipelineHelper( + mlir::OpPassManager& pm, const TfrtPipelineOptions& options); +void CreateTFExecutorToTFInvariantOptimizationPipelineHelper( + mlir::OpPassManager& pm, const TfrtPipelineOptions& options); + +Status CreateTFExecutorToTFPreInvariantOptimizationPipeline( + mlir::PassManager& pm, const TfrtPipelineOptions& options); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_tfrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/tf_to_tfrt.cc index 3205154d0e5..15973c75a9c 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_tfrt.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/tf_to_tfrt.cc @@ -26,53 +26,41 @@ limitations under the License. 
#include "mlir/IR/Dialect.h" #include "mlir/IR/OperationSupport.h" #include "mlir/IR/Types.h" -#include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassOptions.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" -#include "mlir/Transforms/RegionUtils.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h" #include "tensorflow/compiler/mlir/tfrt/analysis/tensor_array_side_effect_analysis.h" #include "tensorflow/compiler/mlir/tfrt/ir/gpu_ops.h" #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h" #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h" -#include "tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_jitrt_ops.h" -#include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_clustering.h" -#include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_passes.h" #include "tensorflow/compiler/mlir/tfrt/transforms/attr_lowering_utils.h" #include "tensorflow/compiler/mlir/tfrt/transforms/corert_converter.h" #include "tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.h" #include "tensorflow/compiler/mlir/tfrt/transforms/gpu_passes.h" #include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/set_shape_invariant_in_while_ops.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.h" #include "tensorflow/compiler/mlir/tfrt/transforms/utils.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/platform/tstring.h" -#include "tfrt/jitrt/opdefs/jitrt_ops.h" // from @tf_runtime #include "tfrt/basic_kernels/opdefs/basic_kernels.h" // from @tf_runtime #include "tfrt/basic_kernels/opdefs/tfrt_base.h" // from @tf_runtime #include "tfrt/basic_kernels/opdefs/types.h" // from @tf_runtime -#include "tfrt/core_runtime/opdefs/attributes.h" // from @tf_runtime #include "tfrt/core_runtime/opdefs/core_runtime.h" // from @tf_runtime #include "tfrt/core_runtime/opdefs/types.h" // from @tf_runtime #include "tfrt/test_kernels/opdefs/test_kernels.h" // from @tf_runtime @@ -94,7 +82,9 @@ constexpr int64_t kDefaultCheapCost = 1; void getDependentConversionDialects(mlir::DialectRegistry ®istry) { registry.insert(); + tfrt::compiler::TFRTDialect>(); + + RegisterJitRtDialects(registry); } mlir::Value GetFunctionInputChain(mlir::Operation *op) { @@ -237,7 +227,13 @@ class FallbackExecuteOpConversion : public mlir::ConversionPattern { // 
called by a TPUPartitionedCall op and will be compiled in // TPUPartitionedCall op via FunctionLibraryRuntime and not be processed // by BEFExecutor. - is_tpu_op) { + // + // We also avoid creating tfrt_fallback_async.createop for all GPU ops + // except for tf.XlaLaunch. This is correct as long as we only run XLA + // clusters on GPU and all other ops on CPU. + is_tpu_op || + (parsed_device_name->device_type == DEVICE_GPU && + op->getName().getStringRef().str() != "tf.XlaLaunch")) { return ConvertToCoreRTExecuteOp( op, operands, parsed_device_name->op_handler_name, op_attrs, op_func_attrs, op_name, rewriter); @@ -450,7 +446,7 @@ class FallbackConstOpConversion tensorflow::TensorProto tensor_proto; auto status = ConvertToTensorProto(op.getValue(), &tensor_proto); - if (!status.ok()) return op.emitError(status.error_message()); + if (!status.ok()) return op.emitError(tsl::NullTerminatedMessage(status)); rewriter.replaceOpWithNewOp( op, rewriter.getType(), @@ -812,7 +808,7 @@ class CoreRTConstStringTensorOpConversion llvm::StringRef(element.data(), element.size()))); // Create the shape attribute from the tensor shape. - ArrayRef shape = op.getValue().getType().getShape(); + ArrayRef shape = op.getValue().getShapedType().getShape(); llvm::SmallVector dims; dims.reserve(shape.size()); auto i64_type = rewriter.getIntegerType(64); @@ -1414,43 +1410,6 @@ mlir::func::FuncOp TFRTWhileOpConversion::GetWhileBodyFunction( return body_fn; } -// TODO(ezhulenev): tf_device.cluster operations after auto-fusion should -// have the correct device assigned based on the fused operations. We should -// use this device to convert operands and results from/to corert handles. -// For now it is safe to assume that it is "CPU" because we do not support -// any other devices and do not support distributed models. -constexpr char kJitRtDevice[] = "/job:localhost/replica:0/task:0/device:CPU:0"; - -// Convert jitrt.call operations to the tf_jitrt.fallback.execute operation. -class JitRtCallToJitRtCompileAndExecuteConversion - : public OpConversionPattern { - public: - explicit JitRtCallToJitRtCompileAndExecuteConversion(MLIRContext *context) - : OpConversionPattern(context) {} - - LogicalResult matchAndRewrite( - tfrt::jitrt::CallOp call, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - // Convert operands to fallback tensors. - llvm::SmallVector fallback_operands; - if (failed(tfrt_compiler::ConvertFallbackOperands( - call, kJitRtDevice, adaptor.getOperands(), &fallback_operands, - rewriter))) - return rewriter.notifyMatchFailure(call, "failed to convert operand"); - - // tf_jitrt.fallback.execute always produces fallback tensors. - llvm::SmallVector result_types( - call->getNumResults(), - rewriter.getType()); - - // Replace jitrt.call operation with a tf_jitrt.fallback.execute operation. - rewriter.replaceOpWithNewOp( - call, result_types, call.getCallee(), fallback_operands, kJitRtDevice); - - return success(); - } -}; - // Helper function for specifying legal dialects for conversion to CoreRT. 
void SetUpTFToTFRTConversionLegality(mlir::ConversionTarget *target, mlir::TypeConverter *func_type_converter, @@ -1459,10 +1418,8 @@ void SetUpTFToTFRTConversionLegality(mlir::ConversionTarget *target, target->addLegalDialect(); target->addLegalDialect(); target->addLegalDialect(); - target->addLegalDialect(); target->addIllegalDialect(); target->addIllegalDialect(); - target->addIllegalDialect(); target->addDynamicallyLegalOp([func_type_converter, chain_type]( func::FuncOp op) { @@ -1477,14 +1434,6 @@ void SetUpTFToTFRTConversionLegality(mlir::ConversionTarget *target, }); } -// Helper function for inserting TFRT JitRt dialect conversions. -void PopulateJitRtConversionPatterns(MLIRContext *context, - RewritePatternSet *patterns, - CoreRTConverter *corert_converter) { - // Lower jitrt.call to the pair of compile and execute operations. - patterns->add(context); -} - // Helper function for inserting TF dialect to TFRT dialect op conversion // patterns. void PopulateTFToTFRTConversionPatterns( @@ -1613,7 +1562,9 @@ class TfToTfrtConversionPass } SetUpTFToTFRTConversionLegality(&target, func_type_converter, corert_converter.chain_type()); - PopulateJitRtConversionPatterns(&context, &patterns, &corert_converter); + + PopulateJitRtConversionPatterns(&target, &context, &patterns, + &corert_converter); PopulateTFToTFRTConversionPatterns( &context, &patterns, &corert_converter, &fallback_converter, @@ -1737,31 +1688,8 @@ class TfToTfrtConversionPass chain_value = create_op; } - // Pre-compile all JIT compiled kernels found in the module. - llvm::SmallVector compiled; - - // A set SymbolRef attributes referencing compiled kernels. - llvm::DenseSet kernels; - - // Compile all kernels in parallell. - module.walk([&](tf_jitrt::FallbackExecuteOp execute) { - // Do not compiled the same kernel multiple times. - if (kernels.contains(execute.getKernel())) return; - - auto compile = builder.create( - execute.getLoc(), chain_type, execute.getKernel(), - execute.getDevice()); - compiled.push_back(compile.getResult()); - kernels.insert(compile.getKernel()); - }); - - // Wait for the compilation completion before returning from init function. - if (!compiled.empty()) { - // Do not forget to wait for the fallback kernels initialization. - compiled.insert(compiled.begin(), chain_value); - chain_value = builder.create( - func_op.getLoc(), chain_type, compiled); - } + chain_value = + CreateJitRtFallbackCompileKernel(builder, module, chain_value); builder.create(func_op.getLoc(), chain_value); } @@ -1877,25 +1805,6 @@ class TfToTfrtConversionPass "currently experimental."), llvm::cl::init(false)}; }; - -// Assigns devices so that later passes can utilize device information. -// Device assignement might have not been done by the upstream pipeline, or get -// removed by previous passes. However, we assume most of the device assignment -// has been done by the upstream pipeline, so we simply assign the default -// device to unassigned ops. Specifically, we do assignment for ConstOp first to -// place it on the same device as its user operation, instead of placing it on -// the default device blindly. -// TODO(b/221297389): Figure out a more robust way to handle dropped device -// assignment. 
-void AddTfDeviceAssignmentPasses(mlir::OpPassManager &pm, - const TfrtPipelineOptions &options) { - pm.addPass(mlir::TF::CreateConstantOpDeviceAssignmentPass()); - pm.addNestedPass( - mlir::TF::CreateTFDeviceAssignmentByFuncAttrPass()); - pm.addNestedPass( - mlir::TF::CreateSimpleTFDeviceAssignmentPass(options.default_device)); -} - } // namespace std::unique_ptr> @@ -1904,425 +1813,6 @@ CreateTfToTfrtConversionPass(const TfrtPipelineOptions &options) { } // -------------------------------------------------------------------------- // -// Outline tf_device.cluster operation regions into functions in the nested -// modules and replaces all cluster operations with jitrt.call operations. -// -------------------------------------------------------------------------- // - -class OutlineJitRtClustersPass - : public PassWrapper> { - public: - llvm::StringRef getArgument() const final { - return "tf-outline-jitrt-cluster"; - } - llvm::StringRef getDescription() const final { - return "Outlines `tf_device.cluster` operations into functions and " - "replaces them with `jitrt.call` operations."; - } - - void runOnOperation() override; - - void getDependentDialects(mlir::DialectRegistry ®istry) const override { - registry.insert(); - } - - private: - struct CompiledModule { - ModuleOp module; - func::FuncOp entrypoint; - llvm::SetVector operands; - }; - - // Creates a nested module with a single function that will be compiled into - // the kernel at runtime. - CompiledModule CreateCompiledModule(tf_device::ClusterOp cluster, - int64_t max_arg_size, - SymbolTable *symbol_table); - - // Update compiled module entrypoint signature with inferred operands - // constraints. - LogicalResult SetEntrypointConstraints(CompiledModule &compiled); - - // Outlines cluster operation regions into compiled modules, and replaces - // cluster operation with a jitrt.call operation. - LogicalResult OutlineClusterOp(tf_device::ClusterOp cluster, - int64_t max_arg_size, - SymbolTable *symbol_table); - - // Mapping from the outlined module string representation to the module itself - // and an entrypoint function. Used to deduplicate identical modules during - // the `tf_device.cluster` outlining. - llvm::StringMap> outlined_; -}; - -OutlineJitRtClustersPass::CompiledModule -OutlineJitRtClustersPass::CreateCompiledModule(tf_device::ClusterOp cluster, - int64_t max_arg_size, - SymbolTable *symbol_table) { - MLIRContext *ctx = cluster->getContext(); - Location loc = cluster.getLoc(); - - // Create a module that will hold compiled function and async wrappers. - // TODO(ezhulenev): Give better names to module and function. - auto compiled_module = ModuleOp::create(loc, {"kernel"}); - compiled_module->setAttr("tfrt.compiled", UnitAttr::get(ctx)); - compiled_module->setAttr( - "tfrt.max-arg-size", - IntegerAttr::get(IntegerType::get(ctx, 64), max_arg_size)); - - SymbolTable compiled_module_symbol_table(compiled_module); - - // Find out the cluster arguments and their types. - llvm::SetVector live_ins; - getUsedValuesDefinedAbove(cluster.getBody(), cluster.getBody(), live_ins); - - llvm::SmallVector operand_types; - operand_types.reserve(live_ins.size()); - for (Value v : live_ins) operand_types.emplace_back(v.getType()); - - // Create a function in the compiled module. 
- auto compiled_func_type = - FunctionType::get(ctx, operand_types, cluster->getResultTypes()); - auto compiled_func = func::FuncOp::create(loc, "compute", compiled_func_type); - compiled_module_symbol_table.insert(compiled_func); - - // Replace uses of live-in values within cluster region with block arguments. - Block *compiled_func_block = compiled_func.addEntryBlock(); - for (auto p : llvm::zip(live_ins, compiled_func_block->getArguments())) - replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), - cluster.getBody()); - - // Move all operations in cluster into compiled_func's entry block. - auto &cluster_body = cluster.GetBody().getOperations(); - compiled_func_block->getOperations().splice( - compiled_func_block->end(), cluster_body, cluster_body.begin(), - cluster_body.end()); - - // Replace `tf_device.return` terminator with `func.return` in the function - // body. - auto device_return = - cast(compiled_func_block->getTerminator()); - OpBuilder builder(device_return.getOperation()); - builder.create(device_return.getLoc(), - device_return.getOperands()); - device_return.erase(); - - // TODO(ezhulenev): MLIR doesn't define operation equivalence upstream yet, - // replace module printing with a more principled solution when available. - // Operations in the cluster can be in different order, however define the - // identical Tensorflow programs, with current approach we'll not be able - // to detect duplicates like this. - - // Remove location attribute attached to Tensorflow operations to be able to - // deduplicate compiled clusters with the same set of operations. - // - // TODO(ezhulenev): Figure out how to propagate locations for error reporting, - // right now JitRt will ignore them anyway. - compiled_module.walk([](Operation *op) { op->removeAttr("_class"); }); - - // Serialize prepared module to string. - std::string serialized; - llvm::raw_string_ostream os(serialized); - compiled_module.print(os); - - // Try to find if identical module was already outlined. - auto it = outlined_.find(serialized); - - // Return identical module that was already outlined earlier. - if (it != outlined_.end()) { - compiled_module.erase(); // erase identical module - return {it->second.first, it->second.second, live_ins}; - } - - // Insert compiled module into the symbol table and assign it a unique name. - symbol_table->insert(compiled_module); - - // Cache unique module. - outlined_.insert({std::move(serialized), {compiled_module, compiled_func}}); - - return {compiled_module, compiled_func, live_ins}; -} - -LogicalResult OutlineJitRtClustersPass::SetEntrypointConstraints( - CompiledModule &compiled) { - func::FuncOp func = compiled.entrypoint; - - // Functions outlined from jitrt device clusters must have a single block. - assert(func.getBody().getBlocks().size() == 1 && "expected single block"); - - mlir::TFDevice::ClusteringPolicySet policies; - populateTfJitRtConstraintsPolicies(policies); - - // Infer constraints on the values defined in the entrypoint function - // (including function entry block arguments). - mlir::TFDevice::ValuesConstraintSet constraints; - if (failed(mlir::TFDevice::PropagateValuesConstraints( - func.getBody(), policies, constraints, /*resolve=*/true))) - return failure(); - - // Annotate arguments with inferred constraints. 
- for (unsigned i = 0; i < func.getNumArguments(); ++i) { - if (auto constraint = constraints.GetConstraint(func.getArgument(i))) { - auto constraint_name = mlir::StringAttr::get( - &getContext(), llvm::formatv("{0}", *constraint).str()); - func.setArgAttr(i, "rt.constraint", constraint_name); - } - } - - return success(); -} - -LogicalResult OutlineJitRtClustersPass::OutlineClusterOp( - tf_device::ClusterOp cluster, int64_t max_arg_size, - SymbolTable *symbol_table) { - Location loc = cluster->getLoc(); - OpBuilder builder(cluster); - - CompiledModule compiled_module = - CreateCompiledModule(cluster, max_arg_size, symbol_table); - func::FuncOp compiled_func = compiled_module.entrypoint; - - // Add constraints to the entrypoint arguments. - if (failed(SetEntrypointConstraints(compiled_module))) return failure(); - - // Replace device cluster with a jitrt.call operation. - auto module_name = *compiled_module.module.getSymName(); - auto func_name = compiled_func.getSymName(); - auto func_flat_ref = - mlir::SymbolRefAttr::get(builder.getContext(), func_name); - auto func_ref = mlir::SymbolRefAttr::get(builder.getContext(), module_name, - {func_flat_ref}); - - auto cluster_func_op = builder.create( - loc, cluster.getResultTypes(), func_ref, - compiled_module.operands.getArrayRef()); - - cluster.replaceAllUsesWith(cluster_func_op); - cluster.erase(); - - return success(); -} - -void OutlineJitRtClustersPass::runOnOperation() { - ModuleOp module = getOperation(); - SymbolTable symbol_table(module); - - // Keep track of the maximum argument size for each function with tf_device - // cluster operations in the function body. We need to pass it to the compiled - // module to correctly compute its cost later. - llvm::DenseMap max_arg_size_map; - - auto get_max_arg_size = [&](mlir::func::FuncOp func) -> int64_t { - auto it = max_arg_size_map.find(func); - if (it != max_arg_size_map.end()) return it->second; - return max_arg_size_map[func] = tf_jitrt::GetMaxArgSize(func); - }; - - OpBuilder builder(module.getContext()); - auto result = module.walk([&](tf_device::ClusterOp cluster) -> WalkResult { - // Ensure that cluster was formed for TFRT JIT compilation. - auto policy = cluster->getAttr("policy").dyn_cast_or_null(); - if (!policy || policy.getValue() != "tfrt.auto-fusion") - return WalkResult::advance(); - - // Get the maximum argument size of the parent function. - mlir::func::FuncOp parent_func = - cluster->getParentOfType(); - int64_t max_arg_size = get_max_arg_size(parent_func); - - if (failed(OutlineClusterOp(cluster, max_arg_size, &symbol_table))) - return WalkResult::interrupt(); - return WalkResult::advance(); - }); - - if (result.wasInterrupted()) { - module->emitError("Failed to outline tf_device.cluster operations"); - signalPassFailure(); - } -} - -static std::unique_ptr CreateOutlineJitRtClustersPass() { - return std::make_unique(); -} - -// -------------------------------------------------------------------------- // - -static void CreateTFExecutorToTFPipelineHelper( - mlir::OpPassManager &pm, const TfrtPipelineOptions &options) { - // Due to b/191304670, functionalized while ops might not have the - // shape_invariant attribute set correctly, which leads to failure in shape - // inference. As a workaround, we conservatively (e.g., we place less - // restrictions on tf.while which will avoid failures but lead to potentially - // less exact shape inference) set the shape_invariant attribute in all - // tf.While ops before performing shape inference. 
- // - // Note that this pass might not work well with TF XLA bridge, but this is - // fine as TF XLA bridge is run before this pipeline. For CPU ops, less exact - // shape inference may lead to fewer optimizations but it should be fine as it - // is limited to while ops currently. - // - // TODO(b/191304670): Remove this pass once the shape_invariant attribute is - // set correctly in the upstream. - pm.addNestedPass( - tfrt_compiler::CreateSetShapeInvariantInWhileOps()); - - // We pass the MLIR module through the TF standard pipeline, which for - // instances does shape inference, canonicalization, inlining, etc. - pm.addNestedPass( - mlir::tf_executor::CreateTFExecutorGraphPruningPass()); - pm.addNestedPass( - mlir::tf_executor::CreateTFExecutorIslandCoarseningPass()); - - AddTfDeviceAssignmentPasses(pm, options); - - pm.addPass(tfrt_compiler::CreateTfrtXlaRewritePass()); - - // Here we perform TFRT specific optimization before standard TF optimization, - // as TFRT-specific optimization may create more opportunities. - pm.addNestedPass( - tfrt_compiler::CreateOptimizeTfForTfrtPass()); - pm.addNestedPass(mlir::createCanonicalizerPass()); - // Guarantee all functions have one use, which enables more exact shape - // inference. - pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass()); - pm.addPass(mlir::TF::CreateTFShapeInferencePass()); - pm.addPass(mlir::createInlinerPass()); - pm.addPass(mlir::createSymbolDCEPass()); - pm.addNestedPass(mlir::TF::CreateTFOptimizePass()); - pm.addNestedPass(mlir::createCSEPass()); - - AddTfDeviceAssignmentPasses(pm, options); - - // After the standard pass, we now have MLIR in TF dialect, and now we convert - // reference variable to resource variables, which is besteffort. - pm.addPass(CreateConvertReferenceVariableToResourceVariablePass()); - - // Move the tf.Assert op to the end of the function, so that it does not - // impose unnecessary control dependencies on other ops. - pm.addPass(tfrt_compiler::CreateReorderTfAssertPass()); - - // Optimze the side-effects of control flow ops by examining the ops in its - // callees. - pm.addPass(tfrt_compiler::CreateOptimizeTfControlFlowSideEffectPass()); - - // Remove tf.If ops' operands that are produced by tf.Const ops. - pm.addPass(tfrt_compiler::CreateRemoveTfIfConstArgsPass()); - - // Merge non-side-effecting tf.If ops if their operands are the same. - pm.addPass(tfrt_compiler::CreateMergeTfIfOpsPass()); - - // Deduplicate functions invoked by tf.BatchFunction with the same - // shared_name - pm.addPass( - tfrt_compiler::CreateDeduplicateFunctionsInovkedByBatchFunctionPass()); - - // RemoveUnusedWhileResultsPass operates on the region-based control flow, so - // the functional control flow is first converted to region-based control - // flow, which is converted back after the optimization passes are performed. - pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions()); - pm.addPass(mlir::createInlinerPass()); - pm.addNestedPass( - mlir::TF::CreateRemoveUnusedWhileResultsPass()); - pm.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional()); - - // Apply standard optimization after optimizing control flow ops. - pm.addPass(mlir::createInlinerPass()); - pm.addNestedPass(mlir::createCSEPass()); - - // TODO(b/187876545): An extra shape inference pass is added because it does - // not work well with tf.Identity op that remove ref type. So we work around - // by performing shape inference again after reference variable to resource - // variable conversion. 
We should remove this after b/187876545 is fixed. - pm.addPass(mlir::TF::CreateTFShapeInferencePass()); - - pm.addNestedPass( - mlir::TFDevice::CreateLaunchToDeviceAttributePass()); - - // After all standard passes run layout optimization to assign optimal data - // format for all layout sensitive operations. - mlir::TF::LayoutOptimizationPipelineOptions layout_optimization_options; - layout_optimization_options.force_data_format = - options.force_data_format.getValue(); - // TODO(b/191304261): Folding transpose in ops is buggy in the layout - // optimization pass. Disable it to avoid errors in b/191304261. This should - // not affect CPU performance as it does not change the number of ops, nor - // does it change the types of the ops. - layout_optimization_options.skip_fold_transpose_in_ops = true; - mlir::TF::CreateLayoutOptimizationPipeline(pm.nest(), - layout_optimization_options); - - // Run canonicalization pipeline to remove unused constants and bypassed - // transpose operations left in the IR after layout optimization. - pm.addNestedPass(mlir::createCanonicalizerPass()); - - // Decompose resource ops as resource variables will be converted to tensors - // directly. - if (options.decompose_resource_ops) - pm.addNestedPass( - mlir::TFDevice::CreateDecomposeResourceOpsPass()); - - AddTfDeviceAssignmentPasses(pm, options); - - pm.addNestedPass( - mlir::TF::CreateTensorDeviceCopyConversionPass()); - - // Outline auto-fusion clusters into tf_device.cluster_operations and then - // convert them to functions. We currently support only tfrt fallback tensors - // as operands, so we disable these passes if we can have native ops after - // lowering. - pm.addNestedPass(CreateTfJitRtClusteringPass( - options.auto_fusion_oplist, options.auto_fusion_min_cluster_size)); - - // Sink small constants into the outlined clusters to reduce the number of - // arguments for each of the execute operations. - auto is_compilable_const = [](mlir::tf_device::ClusterOp cluster, - mlir::ElementsAttr value) -> bool { - // Ensure that cluster was formed for TFRT JIT compilation. - auto policy = cluster->getAttr("policy").dyn_cast_or_null(); - if (!policy || policy.getValue() != "tfrt.auto-fusion") return false; - - // Check that TF->JitRt compiler supports constant compilation. - return mlir::succeeded(IsCompilableConstant(value)); - }; - - pm.addNestedPass( - mlir::TFDevice::CreateClusterConstantSinkingPass(is_compilable_const)); - - // Outline formed JIT compiled device clusters into function. - pm.addPass(CreateOutlineJitRtClustersPass()); - - // Rewriter operation sequences to device specific fusions. - DeviceNameUtils::ParsedName parsed_name; - - // Ignore error. 
- bool success = - DeviceNameUtils::ParseFullName(options.default_device, &parsed_name); - assert(success && "default device is invalid"); - (void)success; - - if (parsed_name.has_type && parsed_name.type == DEVICE_GPU) - pm.addNestedPass(mlir::TF::CreateGpuOpFusionPass()); - - if (parsed_name.has_type && parsed_name.type == DEVICE_CPU) - pm.addNestedPass( - mlir::TF::CreateFusedKernelMatcherPass()); - - if (options.tpu_fuse_ops) { - pm.addNestedPass( - tfrt_compiler::CreateFuseTpuCompileAndExecutePass()); - // Remove ops for the input to _TPUCompileMlirOp, which are no longer needed - // after CreateFuseTpuCompileAndExecutePass - pm.addNestedPass(mlir::createCanonicalizerPass()); - } - - AddTfDeviceAssignmentPasses(pm, options); - - if (options.sink_in_invariant_ops) { - pm.addPass(CreateSinkInInvariantOpsPass()); - } - - pm.addPass(CreateLowerTFSavedModelPass(options.hoist_invariant_ops)); -} - void CreateTfToTfrtPipeline(mlir::OpPassManager &pm, const TfrtPipelineOptions &options) { pm.addPass(CreateTfToTfrtConversionPass(options)); @@ -2341,49 +1831,33 @@ void CreateTfToTfrtPipeline(mlir::OpPassManager &pm, static void CreateTfExecutorToTfrtPipelineHelper( mlir::OpPassManager &pm, const TfrtPipelineOptions &options) { - CreateTFExecutorToTFPipelineHelper(pm, options); + CreateTFExecutorToTFPreInvariantOptimizationPipelineHelper(pm, options); + CreateTFExecutorToTFInvariantOptimizationPipelineHelper(pm, options); CreateTfToTfrtPipeline(pm, options); } -Status ValidateTfrtPipelineOptions(const TfrtPipelineOptions &options) { - if (options.target_tpurt && - (options.target_gpu || options.use_bridge_for_gpu)) { - return tensorflow::errors::Internal( - "Invalid pipeline options. Targeting both TPU and GPU is not " - "supported."); - } - return OkStatus(); -} - // If verbose logging is on, dump the output of each pass to a file directory, // set via env var TF_DUMP_GRAPH_PREFIX. e.g.: // export TF_DUMP_GRAPH_PREFIX=/tmp/mlir Status CreateTfExecutorToTfrtPipeline(mlir::PassManager &pm, const TfrtPipelineOptions &options) { - TF_RETURN_IF_ERROR(CreateTFExecutorToTFPipeline(pm, options)); + TF_RETURN_IF_ERROR( + CreateTFExecutorToTFPreInvariantOptimizationPipeline(pm, options)); + CreateTFExecutorToTFInvariantOptimizationPipelineHelper(pm, options); CreateTfToTfrtPipeline(pm, options); return OkStatus(); } Status CreateTFExecutorToTFPipeline(mlir::PassManager &pm, const TfrtPipelineOptions &options) { - TF_RETURN_IF_ERROR(ValidateTfrtPipelineOptions(options)); - if (VLOG_IS_ON(1)) { - // Print the whole module after each pass, which requires disabling - // multi-threading as well. 
- pm.getContext()->disableMultithreading(); - pm.enableIRPrinting(std::make_unique( - /*print_module_scope=*/true)); - } - CreateTFExecutorToTFPipelineHelper(pm, options); + TF_RETURN_IF_ERROR( + CreateTFExecutorToTFPreInvariantOptimizationPipeline(pm, options)); + CreateTFExecutorToTFInvariantOptimizationPipelineHelper(pm, options); return OkStatus(); } static mlir::PassRegistration tf_to_tfrt_pass; -static mlir::PassRegistration - tf_outline_jitrt_cluster_pass(CreateOutlineJitRtClustersPass); - static mlir::PassPipelineRegistration tf_pipeline( "tf-executor-to-tfrt-pipeline", "Convert Tensorflow Executor dialect to TFRT dialect and " diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_passes.cc new file mode 100644 index 00000000000..91a4c1d61fd --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_passes.cc @@ -0,0 +1,414 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h" +#include "tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_jitrt_ops.h" +#include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_clustering.h" +#include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_passes.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.h" +#include "tfrt/jitrt/opdefs/jitrt_ops.h" // from @tf_runtime +#include "tfrt/basic_kernels/opdefs/basic_kernels.h" // from @tf_runtime + +namespace tensorflow { +namespace { + +class TfrtJitRtStubImpl : public TfrtJitRtStub { + void RegisterJitRtDialects(mlir::DialectRegistry ®istry) override; + + void PopulateJitRtConversionPatterns( + mlir::ConversionTarget *target, mlir::MLIRContext *context, + mlir::RewritePatternSet *patterns, + CoreRTConverter *corert_converter) override; + + mlir::Value CreateJitRtFallbackCompileKernel( + mlir::OpBuilder &builder, mlir::ModuleOp module, + mlir::Value chain_value) override; + + void AddTfrtJitRtPasses(const TfrtPipelineOptions &options, + mlir::OpPassManager &pm) override; +}; + +void TfrtJitRtStubImpl::RegisterJitRtDialects(mlir::DialectRegistry ®istry) { + registry.insert(); +} + +// TODO(ezhulenev): tf_device.cluster operations after auto-fusion should +// have the correct device assigned based on the fused operations. We should +// use this device to convert operands and results from/to corert handles. +// For now it is safe to assume that it is "CPU" because we do not support +// any other devices and do not support distributed models. 
+constexpr char kJitRtDevice[] = "/job:localhost/replica:0/task:0/device:CPU:0";
+
+// Convert jitrt.call operations to the tf_jitrt.fallback.execute operation.
+class JitRtCallToJitRtCompileAndExecuteConversion
+    : public OpConversionPattern {
+ public:
+  explicit JitRtCallToJitRtCompileAndExecuteConversion(MLIRContext *context)
+      : OpConversionPattern(context) {}
+
+  LogicalResult matchAndRewrite(
+      tfrt::jitrt::CallOp call, OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const override {
+    // Convert operands to fallback tensors.
+    llvm::SmallVector fallback_operands;
+    if (failed(tfrt_compiler::ConvertFallbackOperands(
+            call, kJitRtDevice, adaptor.getOperands(), &fallback_operands,
+            rewriter)))
+      return rewriter.notifyMatchFailure(call, "failed to convert operand");
+
+    // tf_jitrt.fallback.execute always produces fallback tensors.
+    llvm::SmallVector result_types(
+        call->getNumResults(),
+        rewriter.getType());
+
+    // Replace jitrt.call operation with a tf_jitrt.fallback.execute operation.
+    rewriter.replaceOpWithNewOp(
+        call, result_types, call.getCallee(), fallback_operands, kJitRtDevice);
+
+    return success();
+  }
+};
+
+// Helper function for inserting TFRT JitRt dialect conversions.
+void TfrtJitRtStubImpl::PopulateJitRtConversionPatterns(
+    mlir::ConversionTarget *target, MLIRContext *context,
+    RewritePatternSet *patterns, CoreRTConverter *corert_converter) {
+  target->addLegalDialect();
+  target->addIllegalDialect();
+  // Lower jitrt.call to the pair of compile and execute operations.
+  patterns->add(context);
+}
+
+mlir::Value TfrtJitRtStubImpl::CreateJitRtFallbackCompileKernel(
+    mlir::OpBuilder &builder, mlir::ModuleOp module, mlir::Value chain_value) {
+  // Pre-compile all JIT compiled kernels found in the module.
+  llvm::SmallVector compiled;
+
+  // A set of SymbolRef attributes referencing compiled kernels.
+  llvm::DenseSet kernels;
+
+  // Compile all kernels in parallel.
+  module.walk([&](tf_jitrt::FallbackExecuteOp execute) {
+    // Do not compile the same kernel multiple times.
+    if (kernels.contains(execute.getKernel())) return;
+
+    auto compile = builder.create(
+        execute.getLoc(), builder.getType(),
+        execute.getKernel(), execute.getDevice());
+    compiled.push_back(compile.getResult());
+    kernels.insert(compile.getKernel());
+  });
+
+  // Wait for the compilation to complete before returning from the init
+  // function.
+  if (!compiled.empty()) {
+    // Do not forget to wait for the fallback kernels initialization.
+    compiled.insert(compiled.begin(), chain_value);
+    chain_value = builder.create(
+        module.getLoc(), builder.getType(),
+        compiled);
+  }
+
+  return chain_value;
+}
+
+// -------------------------------------------------------------------------- //
+// Outline tf_device.cluster operation regions into functions in the nested
+// modules and replaces all cluster operations with jitrt.call operations.
+// -------------------------------------------------------------------------- // + +class OutlineJitRtClustersPass + : public PassWrapper> { + public: + llvm::StringRef getArgument() const final { + return "tf-outline-jitrt-cluster"; + } + llvm::StringRef getDescription() const final { + return "Outlines `tf_device.cluster` operations into functions and " + "replaces them with `jitrt.call` operations."; + } + + void runOnOperation() override; + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OutlineJitRtClustersPass) + + private: + struct CompiledModule { + ModuleOp module; + func::FuncOp entrypoint; + llvm::SetVector operands; + }; + + // Creates a nested module with a single function that will be compiled into + // the kernel at runtime. + CompiledModule CreateCompiledModule(tf_device::ClusterOp cluster, + int64_t max_arg_size, + SymbolTable *symbol_table); + + // Update compiled module entrypoint signature with inferred operands + // constraints. + LogicalResult SetEntrypointConstraints(CompiledModule &compiled); + + // Outlines cluster operation regions into compiled modules, and replaces + // cluster operation with a jitrt.call operation. + LogicalResult OutlineClusterOp(tf_device::ClusterOp cluster, + int64_t max_arg_size, + SymbolTable *symbol_table); + + // Mapping from the outlined module string representation to the module itself + // and an entrypoint function. Used to deduplicate identical modules during + // the `tf_device.cluster` outlining. + llvm::StringMap> outlined_; +}; + +OutlineJitRtClustersPass::CompiledModule +OutlineJitRtClustersPass::CreateCompiledModule(tf_device::ClusterOp cluster, + int64_t max_arg_size, + SymbolTable *symbol_table) { + MLIRContext *ctx = cluster->getContext(); + Location loc = cluster.getLoc(); + + // Create a module that will hold compiled function and async wrappers. + // TODO(ezhulenev): Give better names to module and function. + auto compiled_module = ModuleOp::create(loc, {"kernel"}); + compiled_module->setAttr("tfrt.compiled", UnitAttr::get(ctx)); + compiled_module->setAttr( + "tfrt.max-arg-size", + IntegerAttr::get(IntegerType::get(ctx, 64), max_arg_size)); + + SymbolTable compiled_module_symbol_table(compiled_module); + + // Find out the cluster arguments and their types. + llvm::SetVector live_ins; + getUsedValuesDefinedAbove(cluster.getBody(), cluster.getBody(), live_ins); + + llvm::SmallVector operand_types; + operand_types.reserve(live_ins.size()); + for (Value v : live_ins) operand_types.emplace_back(v.getType()); + + // Create a function in the compiled module. + auto compiled_func_type = + FunctionType::get(ctx, operand_types, cluster->getResultTypes()); + auto compiled_func = func::FuncOp::create(loc, "compute", compiled_func_type); + compiled_module_symbol_table.insert(compiled_func); + + // Replace uses of live-in values within cluster region with block arguments. + Block *compiled_func_block = compiled_func.addEntryBlock(); + for (auto p : llvm::zip(live_ins, compiled_func_block->getArguments())) + replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), + cluster.getBody()); + + // Move all operations in cluster into compiled_func's entry block. 
+ auto &cluster_body = cluster.GetBody().getOperations(); + compiled_func_block->getOperations().splice( + compiled_func_block->end(), cluster_body, cluster_body.begin(), + cluster_body.end()); + + // Replace `tf_device.return` terminator with `func.return` in the function + // body. + auto device_return = + cast(compiled_func_block->getTerminator()); + OpBuilder builder(device_return.getOperation()); + builder.create(device_return.getLoc(), + device_return.getOperands()); + device_return.erase(); + + // TODO(ezhulenev): MLIR doesn't define operation equivalence upstream yet, + // replace module printing with a more principled solution when available. + // Operations in the cluster can be in different order, however define the + // identical Tensorflow programs, with current approach we'll not be able + // to detect duplicates like this. + + // Remove location attribute attached to Tensorflow operations to be able to + // deduplicate compiled clusters with the same set of operations. + // + // TODO(ezhulenev): Figure out how to propagate locations for error reporting, + // right now JitRt will ignore them anyway. + compiled_module.walk([](Operation *op) { op->removeAttr("_class"); }); + + // Serialize prepared module to string. + std::string serialized; + llvm::raw_string_ostream os(serialized); + compiled_module.print(os); + + // Try to find if identical module was already outlined. + auto it = outlined_.find(serialized); + + // Return identical module that was already outlined earlier. + if (it != outlined_.end()) { + compiled_module.erase(); // erase identical module + return {it->second.first, it->second.second, live_ins}; + } + + // Insert compiled module into the symbol table and assign it a unique name. + symbol_table->insert(compiled_module); + + // Cache unique module. + outlined_.insert({std::move(serialized), {compiled_module, compiled_func}}); + + return {compiled_module, compiled_func, live_ins}; +} + +LogicalResult OutlineJitRtClustersPass::SetEntrypointConstraints( + CompiledModule &compiled) { + func::FuncOp func = compiled.entrypoint; + + // Functions outlined from jitrt device clusters must have a single block. + assert(func.getBody().getBlocks().size() == 1 && "expected single block"); + + mlir::TFDevice::ClusteringPolicySet policies; + populateTfJitRtConstraintsPolicies(policies); + + // Infer constraints on the values defined in the entrypoint function + // (including function entry block arguments). + mlir::TFDevice::ValuesConstraintSet constraints; + if (failed(mlir::TFDevice::PropagateValuesConstraints( + func.getBody(), policies, constraints, /*resolve=*/true))) + return failure(); + + // Annotate arguments with inferred constraints. + for (unsigned i = 0; i < func.getNumArguments(); ++i) { + if (auto constraint = constraints.GetConstraint(func.getArgument(i))) { + auto constraint_name = mlir::StringAttr::get( + &getContext(), llvm::formatv("{0}", *constraint).str()); + func.setArgAttr(i, "rt.constraint", constraint_name); + } + } + + return success(); +} + +LogicalResult OutlineJitRtClustersPass::OutlineClusterOp( + tf_device::ClusterOp cluster, int64_t max_arg_size, + SymbolTable *symbol_table) { + Location loc = cluster->getLoc(); + OpBuilder builder(cluster); + + CompiledModule compiled_module = + CreateCompiledModule(cluster, max_arg_size, symbol_table); + func::FuncOp compiled_func = compiled_module.entrypoint; + + // Add constraints to the entrypoint arguments. 
+ if (failed(SetEntrypointConstraints(compiled_module))) return failure(); + + // Replace device cluster with a jitrt.call operation. + auto module_name = *compiled_module.module.getSymName(); + auto func_name = compiled_func.getSymName(); + auto func_flat_ref = + mlir::SymbolRefAttr::get(builder.getContext(), func_name); + auto func_ref = mlir::SymbolRefAttr::get(builder.getContext(), module_name, + {func_flat_ref}); + + auto cluster_func_op = builder.create( + loc, cluster.getResultTypes(), func_ref, + compiled_module.operands.getArrayRef()); + + cluster.replaceAllUsesWith(cluster_func_op); + cluster.erase(); + + return success(); +} + +void OutlineJitRtClustersPass::runOnOperation() { + ModuleOp module = getOperation(); + SymbolTable symbol_table(module); + + // Keep track of the maximum argument size for each function with tf_device + // cluster operations in the function body. We need to pass it to the compiled + // module to correctly compute its cost later. + llvm::DenseMap max_arg_size_map; + + auto get_max_arg_size = [&](mlir::func::FuncOp func) -> int64_t { + auto it = max_arg_size_map.find(func); + if (it != max_arg_size_map.end()) return it->second; + return max_arg_size_map[func] = tf_jitrt::GetMaxArgSize(func); + }; + + OpBuilder builder(module.getContext()); + auto result = module.walk([&](tf_device::ClusterOp cluster) -> WalkResult { + // Ensure that cluster was formed for TFRT JIT compilation. + auto policy = cluster->getAttr("policy").dyn_cast_or_null(); + if (!policy || policy.getValue() != "tfrt.auto-fusion") + return WalkResult::advance(); + + // Get the maximum argument size of the parent function. + mlir::func::FuncOp parent_func = + cluster->getParentOfType(); + int64_t max_arg_size = get_max_arg_size(parent_func); + + if (failed(OutlineClusterOp(cluster, max_arg_size, &symbol_table))) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) { + module->emitError("Failed to outline tf_device.cluster operations"); + signalPassFailure(); + } +} + +std::unique_ptr CreateOutlineJitRtClustersPass() { + return std::make_unique(); +} + +void TfrtJitRtStubImpl::AddTfrtJitRtPasses(const TfrtPipelineOptions &options, + mlir::OpPassManager &pm) { + // Outline auto-fusion clusters into tf_device.cluster_operations and then + // convert them to functions. We currently support only tfrt fallback tensors + // as operands, so we disable these passes if we can have native ops after + // lowering. + pm.addNestedPass(CreateTfJitRtClusteringPass( + options.auto_fusion_oplist, options.auto_fusion_min_cluster_size)); + + // Sink small constants into the outlined clusters to reduce the number of + // arguments for each of the execute operations. + auto is_compilable_const = [](mlir::tf_device::ClusterOp cluster, + mlir::ElementsAttr value) -> bool { + // Ensure that cluster was formed for TFRT JIT compilation. + auto policy = cluster->getAttr("policy").dyn_cast_or_null(); + if (!policy || policy.getValue() != "tfrt.auto-fusion") return false; + + // Check that TF->JitRt compiler supports constant compilation. + return mlir::succeeded(IsCompilableConstant(value)); + }; + + pm.addNestedPass( + mlir::TFDevice::CreateClusterConstantSinkingPass(is_compilable_const)); + + // Outline formed JIT compiled device clusters into function. 
+ pm.addPass(CreateOutlineJitRtClustersPass()); +} + +mlir::PassRegistration tf_outline_jitrt_cluster_pass( + CreateOutlineJitRtClustersPass); + +const bool kUnused = + (RegisterTfrtJitRtStub(std::make_unique()), true); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.cc b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.cc new file mode 100644 index 00000000000..1bde6382c79 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.cc @@ -0,0 +1,76 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.h" + +#include +#include +#include + +namespace tensorflow { +namespace { + +class TfrtJitRtStubRegistry { + public: + TfrtJitRtStubRegistry() : stub_(std::make_unique()) {} + + void Register(std::unique_ptr stub) { + stub_ = std::move(stub); + } + + TfrtJitRtStub &Get() { return *stub_; } + + private: + std::unique_ptr stub_; +}; + +TfrtJitRtStubRegistry &GetGlobalTfrtJitRtStubRegistry() { + static auto *const stub = new TfrtJitRtStubRegistry; + return *stub; +} + +} // namespace + +void RegisterTfrtJitRtStub(std::unique_ptr stub) { + GetGlobalTfrtJitRtStubRegistry().Register(std::move(stub)); +} + +void RegisterJitRtDialects(mlir::DialectRegistry ®istry) { + GetGlobalTfrtJitRtStubRegistry().Get().RegisterJitRtDialects(registry); +} + +// Helper function for inserting TFRT JitRt dialect conversions. +void PopulateJitRtConversionPatterns(mlir::ConversionTarget *target, + mlir::MLIRContext *context, + mlir::RewritePatternSet *patterns, + CoreRTConverter *corert_converter) { + GetGlobalTfrtJitRtStubRegistry().Get().PopulateJitRtConversionPatterns( + target, context, patterns, corert_converter); +} + +mlir::Value CreateJitRtFallbackCompileKernel(mlir::OpBuilder &builder, + mlir::ModuleOp module, + mlir::Value chain_value) { + return GetGlobalTfrtJitRtStubRegistry() + .Get() + .CreateJitRtFallbackCompileKernel(builder, module, chain_value); +} + +void AddTfrtJitRtPasses(const TfrtPipelineOptions &options, + mlir::OpPassManager &pm) { + GetGlobalTfrtJitRtStubRegistry().Get().AddTfrtJitRtPasses(options, pm); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.h b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.h new file mode 100644 index 00000000000..d9c00c4d376 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.h @@ -0,0 +1,71 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TFRT_JITRT_STUB_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TFRT_JITRT_STUB_H_ + +#include + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/corert_converter.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" + +namespace tensorflow { + +class TfrtJitRtStub { + public: + virtual ~TfrtJitRtStub() = default; + + virtual void RegisterJitRtDialects(mlir::DialectRegistry ®istry) {} + + virtual void PopulateJitRtConversionPatterns( + mlir::ConversionTarget *target, mlir::MLIRContext *context, + mlir::RewritePatternSet *patterns, CoreRTConverter *corert_converter) {} + + virtual mlir::Value CreateJitRtFallbackCompileKernel( + mlir::OpBuilder &builder, mlir::ModuleOp module, + mlir::Value chain_value) { + return chain_value; + } + + virtual void AddTfrtJitRtPasses(const TfrtPipelineOptions &options, + mlir::OpPassManager &pm) {} +}; + +void RegisterTfrtJitRtStub(std::unique_ptr stub); + +void RegisterJitRtDialects(mlir::DialectRegistry ®istry); + +// Helper function for inserting TFRT JitRt dialect conversions. +void PopulateJitRtConversionPatterns(mlir::ConversionTarget *target, + mlir::MLIRContext *context, + mlir::RewritePatternSet *patterns, + CoreRTConverter *corert_converter); + +mlir::Value CreateJitRtFallbackCompileKernel(mlir::OpBuilder &builder, + mlir::ModuleOp module, + mlir::Value chain_value); + +void AddTfrtJitRtPasses(const TfrtPipelineOptions &options, + mlir::OpPassManager &pm); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TFRT_JITRT_STUB_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h index cdd221750dc..24d245b1714 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h @@ -122,6 +122,11 @@ struct TfrtPipelineOptions "out to run during loading."), llvm::cl::init(false)}; + Option fuse_get_resource_ops_in_hoisting{ + *this, "fuse-get-resource-ops-in-hoisting", + llvm::cl::desc("If true, get_resource_op will be fused during hoisting"), + llvm::cl::init(true)}; + Option sink_in_invariant_ops{ *this, "sink-in-invariant-ops", llvm::cl::desc("If true, sink the selected invariant ops in to the " diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc index 2980c5486eb..1573306ba2d 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc @@ -16,21 +16,26 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tfrt/translate/import_model.h" #include +#include #include #include #include -#include "absl/strings/match.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/bridge.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" #include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" #include "tensorflow/core/common_runtime/function_body.h" #include "tensorflow/core/common_runtime/function_def_utils.h" +#include "tensorflow/tsl/platform/errors.h" #include "tfrt/bef_converter/mlir_to_bef.h" // from @tf_runtime namespace tensorflow { @@ -108,7 +113,7 @@ Status ConvertFunctionToBef( if (!expected_module.ok()) return tensorflow::errors::Internal( "Failed to convert function to mlir for function ", function_name.str(), - ". Error: ", expected_module.status().error_message()); + ". Error: ", expected_module.status().message()); auto module = std::move(expected_module).value(); @@ -165,8 +170,7 @@ Status ConvertTfMlirToRuntimeExecutable( } } else if (options.device_target == TfrtDeviceInfraTarget::kGpu && options.use_bridge_for_gpu) { - TF_RETURN_IF_ERROR( - mlir::TF::RunTFXLABridge(module, /*enable_logging=*/VLOG_IS_ON(1))); + TF_RETURN_IF_ERROR(mlir::TF::RunTFXLABridge(module)); // GPU XLA clusters are wrapped in functions, which could be transformed by // bridge. Hence, the MLIR functions for XLA clusters are exported and added @@ -187,44 +191,13 @@ Status ConvertTfMlirToRuntimeExecutable( // Lower MLIR TF Dialect to MLIR TFRT CoreRT dialect. mlir::PassManager pm(module.getContext()); - tensorflow::TfrtPipelineOptions pass_options; - if (!options.default_device.empty()) { - pass_options.default_device = options.default_device; - } - if (!options.force_data_format.empty()) { - pass_options.force_data_format = options.force_data_format; - } - - // TODO(b/187991150): Consider only decomposing read-only resource variable - // ops. 
- pass_options.decompose_resource_ops = options.decompose_resource_ops; - pass_options.enable_optimizer = options.enable_optimizer; - pass_options.target_tpurt = - (options.device_target == TfrtDeviceInfraTarget::kTpurt); - pass_options.target_gpu = - (options.device_target == TfrtDeviceInfraTarget::kGpu); - pass_options.use_bridge_for_gpu = options.use_bridge_for_gpu; - pass_options.tpu_fuse_ops = options.tpu_fuse_ops; - pass_options.use_tpu_host_allocator_for_inputs = - options.use_tpu_host_allocator_for_inputs; - pass_options.tpu_allow_unpadded_batch = options.tpu_allow_unpadded_batch; - pass_options.sink_in_invariant_ops = options.sink_in_invariant_ops; - pass_options.hoist_invariant_ops = options.hoist_invariant_ops; - pass_options.func_use_fallback_tensor = true; - pass_options.enable_while_parallel_iterations = - options.enable_while_parallel_iterations; - pass_options.auto_fusion_oplist = options.auto_fusion_oplist; - pass_options.auto_fusion_min_cluster_size = - options.auto_fusion_min_cluster_size; - pass_options.cost_threshold = options.cost_threshold; - pass_options.upper_cost_threshold = options.upper_cost_threshold; - pass_options.merge_inter_dependent_streams = - options.merge_inter_dependent_streams; + auto pipeline_options = GetTfrtPipelineOptions(options); TF_RETURN_IF_ERROR( - tensorflow::CreateTFExecutorToTFPipeline(pm, pass_options)); + tensorflow::CreateTFExecutorToTFPreInvariantOptimizationPipeline( + pm, *pipeline_options)); - auto status = emit_executable(pm, module, pass_options); + auto status = emit_executable(pm, module, *pipeline_options); if (VLOG_IS_ON(1)) { tensorflow::DumpMlirOpToFile("tfrt_dialect", module); @@ -241,11 +214,17 @@ Status ConvertTfMlirToBef(const TfrtCompileOptions& options, [bef_buffer](mlir::PassManager& pm, mlir::ModuleOp module, const tensorflow::TfrtPipelineOptions& options) { mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); + tensorflow::CreateTFExecutorToTFInvariantOptimizationPipelineHelper( + pm, options); tensorflow::CreateTfToTfrtPipeline(pm, options); - if (mlir::failed(pm.run(module))) + if (mlir::failed(pm.run(module))) { + if (VLOG_IS_ON(1)) { + tensorflow::DumpMlirOpToFile("tf_to_corert_failure", module); + } return diag_handler.Combine(tensorflow::errors::Internal( "failed to lower TF Dialect to CoreRT dialect.")); + } *bef_buffer = tfrt::ConvertMLIRToBEF(module, /*disable_optional_sections=*/true); @@ -259,4 +238,45 @@ Status ConvertTfMlirToBef(const TfrtCompileOptions& options, fallback_state); } +std::unique_ptr GetTfrtPipelineOptions( + const TfrtCompileOptions& options) { + auto pipeline_options = std::make_unique(); + if (!options.default_device.empty()) { + pipeline_options->default_device = options.default_device; + } + if (!options.force_data_format.empty()) { + pipeline_options->force_data_format = options.force_data_format; + } + + // TODO(b/187991150): Consider only decomposing read-only resource variable + // ops. 
+ pipeline_options->decompose_resource_ops = options.decompose_resource_ops; + pipeline_options->enable_optimizer = options.enable_optimizer; + pipeline_options->target_tpurt = + (options.device_target == TfrtDeviceInfraTarget::kTpurt); + pipeline_options->target_gpu = + (options.device_target == TfrtDeviceInfraTarget::kGpu); + pipeline_options->use_bridge_for_gpu = options.use_bridge_for_gpu; + pipeline_options->tpu_fuse_ops = options.tpu_fuse_ops; + pipeline_options->use_tpu_host_allocator_for_inputs = + options.use_tpu_host_allocator_for_inputs; + pipeline_options->tpu_allow_unpadded_batch = options.tpu_allow_unpadded_batch; + pipeline_options->sink_in_invariant_ops = options.sink_in_invariant_ops; + pipeline_options->hoist_invariant_ops = options.hoist_invariant_ops; + pipeline_options->fuse_get_resource_ops_in_hoisting = + options.fuse_get_resource_ops_in_hoisting; + pipeline_options->func_use_fallback_tensor = true; + pipeline_options->enable_while_parallel_iterations = + options.enable_while_parallel_iterations; + pipeline_options->auto_fusion_oplist = options.auto_fusion_oplist; + pipeline_options->auto_fusion_min_cluster_size = + options.auto_fusion_min_cluster_size; + pipeline_options->cost_threshold = options.cost_threshold; + pipeline_options->upper_cost_threshold = options.upper_cost_threshold; + pipeline_options->merge_inter_dependent_streams = + options.merge_inter_dependent_streams; + + return pipeline_options; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.h b/tensorflow/compiler/mlir/tfrt/translate/import_model.h index 9df6ae57137..2b2dc6cc987 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/import_model.h +++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.h @@ -16,15 +16,15 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_IMPORT_MODEL_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_IMPORT_MODEL_H_ +#include #include -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tfrt/function/function.h" #include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" #include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/platform/status.h" @@ -61,6 +61,9 @@ Status ConvertTfMlirToRuntimeExecutable( emit_executable, tfrt_stub::FallbackState* fallback_state = nullptr); +std::unique_ptr GetTfrtPipelineOptions( + const TfrtCompileOptions& options); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_IMPORT_MODEL_H_ diff --git a/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h b/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h index 1f6bcb54baf..e451cf737f3 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h +++ b/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h @@ -21,6 +21,8 @@ limitations under the License. #include #include +#include "tensorflow/core/protobuf/config.pb.h" + namespace tensorflow { enum class TfrtDeviceInfraTarget { @@ -47,6 +49,10 @@ struct TfrtCompileOptions { // If true, run grappler passes before compiling. 
bool enable_grappler = true; + // Graph rewrite options that will be applied on GraphDef before converting to + // MLIR. + GraphOptions graph_options; + // Force data format for all layout sensitive operations, eg. setting it to // "NHWC" will changes all data format in the graph to "NHWC" by inserting // or removing related tf.Transpose op. Currently the supported formats are @@ -97,6 +103,9 @@ struct TfrtCompileOptions { // supposed to be turned on by default. bool hoist_invariant_ops = false; + // If true, get_resource_op will be fused during hoisting. + bool fuse_get_resource_ops_in_hoisting = true; + // If true, the compiler will try to sink in the invariant ops (e.g. const // ops, var handle ops, etc.) to the nested function (e.g. batch function) to // facilitate invariant ops hoisting. diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index c815a19f411..145cce5ac6b 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -45,7 +45,7 @@ cc_library( deps = [ "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tf2xla:xla_legalize_tf_no_fallback", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf_no_fallback", "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:bufferize", "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:gpu_passes", # fixdeps: keep "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:passes", @@ -60,11 +60,13 @@ cc_library( "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:ArithTransforms", "@llvm-project//mlir:BufferizationTransforms", + "@llvm-project//mlir:BuiltinToLLVMIRTranslation", "@llvm-project//mlir:ComplexToStandard", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:FuncToLLVM", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:GPUToGPURuntimeTransforms", + "@llvm-project//mlir:GPUToLLVMIRTranslation", "@llvm-project//mlir:GPUToNVVMTransforms", "@llvm-project//mlir:GPUTransforms", "@llvm-project//mlir:IR", @@ -96,24 +98,24 @@ tf_cc_binary( ], deps = [ ":kernel_creator", - "@llvm-project//llvm:TargetParser", "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:lib", "@com_google_absl//absl/strings", - "@llvm-project//llvm:Analysis", "@llvm-project//llvm:ARMCodeGen", # fixdeps: keep + "@llvm-project//llvm:Analysis", "@llvm-project//llvm:CodeGen", "@llvm-project//llvm:Core", "@llvm-project//llvm:MC", "@llvm-project//llvm:PowerPCCodeGen", # fixdeps: keep "@llvm-project//llvm:Support", "@llvm-project//llvm:Target", + "@llvm-project//llvm:TargetParser", "@llvm-project//llvm:X86CodeGen", # fixdeps: keep "@llvm-project//llvm:X86Disassembler", # fixdeps: keep "@llvm-project//mlir:ExecutionEngineUtils", - "@llvm-project//mlir:Pass", "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:Pass", "@llvm-project//mlir:ToLLVMIRTranslation", ] + if_llvm_system_z_available([ "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep @@ -188,14 +190,14 @@ cc_library( "-DTENSORFLOW_USE_ROCM=1", ]), deps = [ - "@llvm-project//mlir:mlir_runner_utils", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", + "//tensorflow/compiler/xla/stream_executor:stream_executor_headers", "//tensorflow/core:framework", "//tensorflow/core/platform:logging", "//tensorflow/core/platform:mutex", - "//tensorflow/compiler/xla/stream_executor:stream_executor_headers", 
"//tensorflow/tsl/platform:hash", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@llvm-project//mlir:mlir_runner_utils", ] + if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", "//tensorflow/compiler/xla/stream_executor/cuda:stream_executor_cuda", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD index f040ca2af3b..29ae6752cd2 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD @@ -83,6 +83,7 @@ cc_library( ":tf_framework_ops_inc_gen", ":tf_status_inc_gen", "//tensorflow/core/protobuf:error_codes_proto_impl_cc", + "@com_google_absl//absl/status", "@llvm-project//mlir:AllocationOpInterface", "@llvm-project//mlir:BufferizationDialect", "@llvm-project//mlir:ControlFlowInterfaces", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc index ac791ace79c..48e288eb48d 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc @@ -120,43 +120,43 @@ std::optional JITExecuteOp::buildClone(OpBuilder &builder, Value alloc) { .getResult(); } -::tensorflow::error::Code ConvertAttrToEnumValue(ErrorCode error_code) { +absl::StatusCode ConvertAttrToEnumValue(ErrorCode error_code) { using ::tensorflow::error::Code; switch (error_code) { case ErrorCode::OK: - return Code::OK; + return absl::StatusCode::kOk; case ErrorCode::CANCELLED: - return Code::CANCELLED; + return absl::StatusCode::kCancelled; case ErrorCode::UNKNOWN: - return Code::UNKNOWN; + return absl::StatusCode::kUnknown; case ErrorCode::INVALID_ARGUMENT: - return Code::INVALID_ARGUMENT; + return absl::StatusCode::kInvalidArgument; case ErrorCode::DEADLINE_EXCEEDED: - return Code::DEADLINE_EXCEEDED; + return absl::StatusCode::kDeadlineExceeded; case ErrorCode::NOT_FOUND: - return Code::NOT_FOUND; + return absl::StatusCode::kNotFound; case ErrorCode::ALREADY_EXISTS: - return Code::ALREADY_EXISTS; + return absl::StatusCode::kAlreadyExists; case ErrorCode::PERMISSION_DENIED: - return Code::PERMISSION_DENIED; + return absl::StatusCode::kPermissionDenied; case ErrorCode::UNAUTHENTICATED: - return Code::UNAUTHENTICATED; + return absl::StatusCode::kUnauthenticated; case ErrorCode::RESOURCE_EXHAUSTED: - return Code::RESOURCE_EXHAUSTED; + return absl::StatusCode::kResourceExhausted; case ErrorCode::FAILED_PRECONDITION: - return Code::FAILED_PRECONDITION; + return absl::StatusCode::kFailedPrecondition; case ErrorCode::ABORTED: - return Code::ABORTED; + return absl::StatusCode::kAborted; case ErrorCode::OUT_OF_RANGE: - return Code::OUT_OF_RANGE; + return absl::StatusCode::kOutOfRange; case ErrorCode::UNIMPLEMENTED: - return Code::UNIMPLEMENTED; + return absl::StatusCode::kUnimplemented; case ErrorCode::INTERNAL: - return Code::INTERNAL; + return absl::StatusCode::kInternal; case ErrorCode::UNAVAILABLE: - return Code::UNAVAILABLE; + return absl::StatusCode::kUnavailable; case ErrorCode::DATA_LOSS: - return Code::DATA_LOSS; + return absl::StatusCode::kDataLoss; } } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h index 6f05c194093..c5f011f25cf 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h @@ -18,6 +18,7 
@@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ #define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ +#include "absl/status/status.h" #include "mlir/Dialect/Bufferization/IR/AllocationOpInterface.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project @@ -48,7 +49,7 @@ class JITCallableType using Base::Base; }; -::tensorflow::error::Code ConvertAttrToEnumValue(ErrorCode error_code); +absl::StatusCode ConvertAttrToEnumValue(ErrorCode error_code); } // namespace tf_framework } // namespace kernel_gen diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc index 2f946e28bd8..0a1977380da 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc @@ -46,6 +46,8 @@ limitations under the License. #include "mlir/Parser/Parser.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" // from @llvm-project +#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" // from @llvm-project #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" // from @llvm-project #include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h" // from @llvm-project #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" // from @llvm-project @@ -409,6 +411,8 @@ StatusOr> SetupContextAndParseModule( mlir::DialectRegistry registry; mlir::RegisterAllTensorFlowDialects(registry); registry.insert(); + mlir::registerBuiltinDialectTranslation(registry); + mlir::registerGPUDialectTranslation(registry); mlir::registerLLVMDialectTranslation(registry); mlir::registerNVVMDialectTranslation(registry); mlir::registerROCDLDialectTranslation(registry); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir index bd5ef12ba3f..e2a5601fc53 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir @@ -1,10 +1,10 @@ // RUN: kernel-gen-opt %s -tf-kernel-to-llvm -split-input-file | FileCheck %s // CHECK: llvm.func @_mlir_ciface_tf_alloc -// CHECK-SAME: (!llvm.ptr, i64, i64, i32, i32, !llvm.ptr) -> !llvm.ptr +// CHECK-SAME: (!llvm.ptr, i64, i64, i32, i32, !llvm.ptr) -> !llvm.ptr // CHECK-LABEL: llvm.func @alloc( -// CHECK-SAME: [[TF_CTX:%.*]]: !llvm.ptr, +// CHECK-SAME: [[TF_CTX:%.*]]: !llvm.ptr, // CHECK-SAME: [[SIZE_0:%.*]]: i64, // CHECK-SAME: [[SIZE_2:%.*]]: i64) -> [[DESC_TY:!.*]] { func.func @alloc(%ctx: !tf_framework.op_kernel_context, @@ -18,16 +18,16 @@ func.func @alloc(%ctx: !tf_framework.op_kernel_context, // CHECK: [[NUM_ELEMS:%.*]] = llvm.mul [[NUM_ELEM_0]], [[SIZE_2]] : i64 // Compute the size of an individual element. 
-// CHECK: [[NULL:%.*]] = llvm.mlir.null : !llvm.ptr +// CHECK: [[NULL:%.*]] = llvm.mlir.null : !llvm.ptr // CHECK: [[GEP:%.*]] = llvm.getelementptr [[NULL]]{{\[}}1] -// CHECK-SAME: (!llvm.ptr) -> !llvm.ptr +// CHECK-SAME: (!llvm.ptr) -> !llvm.ptr, f32 // CHECK: [[SIZE_OF_FLOAT:%.*]] = llvm.ptrtoint [[GEP]] -// CHECK-SAME: !llvm.ptr to i64 +// CHECK-SAME: !llvm.ptr to i64 // Compute output index (-1) and candidate indices (0, NULL). // CHECK: [[OUTPUT_INDEX:%.*]] = llvm.mlir.constant(-1 : i32) : i32 // CHECK-NEXT: [[NUM_CANDIDATES:%.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK-NEXT: [[CANDIDATES_PTR:%.*]] = llvm.mlir.null : !llvm.ptr +// CHECK-NEXT: [[CANDIDATES_PTR:%.*]] = llvm.mlir.null : !llvm.ptr // Allocate memory. // CHECK: [[BYTES_PTR:%.*]] = llvm.call @{{.*}}([[TF_CTX]], [[NUM_ELEMS]], @@ -38,10 +38,8 @@ func.func @alloc(%ctx: !tf_framework.op_kernel_context, // CHECK: [[DESC_0:%.*]] = llvm.mlir.undef : [[DESC_TY]] // Set pointers and offset. -// CHECK: [[FLOAT_PTR:%.*]] = llvm.bitcast [[BYTES_PTR]] -// CHECK-SAME: !llvm.ptr to !llvm.ptr -// CHECK: [[DESC_1:%.*]] = llvm.insertvalue [[FLOAT_PTR]], [[DESC_0]][0] -// CHECK: [[DESC_2:%.*]] = llvm.insertvalue [[FLOAT_PTR]], [[DESC_1]][1] +// CHECK: [[DESC_1:%.*]] = llvm.insertvalue [[BYTES_PTR]], [[DESC_0]][0] +// CHECK: [[DESC_2:%.*]] = llvm.insertvalue [[BYTES_PTR]], [[DESC_1]][1] // CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : index) : i64 // CHECK: [[DESC_3:%.*]] = llvm.insertvalue [[C0]], [[DESC_2]][2] : [[DESC_TY]] @@ -59,10 +57,10 @@ func.func @alloc(%ctx: !tf_framework.op_kernel_context, // ----- -// CHECK: llvm.func @_mlir_ciface_tf_dealloc(!llvm.ptr, !llvm.ptr) +// CHECK: llvm.func @_mlir_ciface_tf_dealloc(!llvm.ptr, !llvm.ptr) // CHECK-LABEL: llvm.func @dealloc( -// CHECK-SAME: [[TF_CTX:%.*]]: !llvm.ptr, +// CHECK-SAME: [[TF_CTX:%[a-z0-9]*]]: !llvm.ptr func.func @dealloc(%ctx: !tf_framework.op_kernel_context, %memref : memref) { tf_framework.dealloc(%ctx, %memref) : memref @@ -71,29 +69,27 @@ func.func @dealloc(%ctx: !tf_framework.op_kernel_context, // Extract allocated ptr from the memref descriptor. // CHECK: %{{.*}} = llvm.mlir.undef : [[DESC_TY:!.*]] // CHECK: [[FLOAT_PTR:%.*]] = llvm.extractvalue %{{.*}}[0] : [[DESC_TY]] -// CHECK-NEXT: [[VOID_PTR:%.*]] = llvm.bitcast [[FLOAT_PTR]] -// CHECK-SAME: !llvm.ptr to !llvm.ptr // Deallocate. 
// CHECK: llvm.call @_mlir_ciface_tf_dealloc( -// CHECK-SAME: [[TF_CTX]], [[VOID_PTR]]) : (!llvm.ptr, !llvm.ptr) -> () +// CHECK-SAME: [[TF_CTX]], [[FLOAT_PTR]]) : (!llvm.ptr, !llvm.ptr) -> () // ----- -// CHECK-LABEL: llvm.func @_mlir_ciface_tf_report_error(!llvm.ptr, i32, !llvm.ptr) +// CHECK-LABEL: llvm.func @_mlir_ciface_tf_report_error(!llvm.ptr, i32, !llvm.ptr) // CHECK: llvm.mlir.global internal constant [[MSG_CONST:@error_message_[0-9]+]]("Everything is awesome\00") func.func @report_error(%ctx: !tf_framework.op_kernel_context) { tf_framework.report_error %ctx, "INVALID_ARGUMENT", "Everything is awesome" loc(unknown) func.return } -// CHECK: llvm.func @report_error([[CTX:%.*]]: !llvm.ptr) +// CHECK: llvm.func @report_error([[CTX:%.*]]: !llvm.ptr) // CHECK-NEXT: [[ADDR:%.*]] = llvm.mlir.addressof [[MSG_CONST]] // CHECK: [[MSG:%.*]] = llvm.getelementptr [[ADDR]] // CHECK: [[CODE:%.*]] = llvm.mlir.constant({{.*}}) : i32 // CHECK: llvm.call @{{.*}}_tf_report_error([[CTX]], [[CODE]], [[MSG]]) -// ---- +// ----- // CHECK-LABEL: llvm.func @unranked_null_memref() func.func @unranked_null_memref() { @@ -101,12 +97,12 @@ func.func @unranked_null_memref() { func.return } // CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : index) : i64 -// CHECK: [[DESC_0:%.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> +// CHECK: [[DESC_0:%.*]] = llvm.mlir.undef : !llvm.struct<(i64, ptr)> // CHECK: [[DESC_1:%.*]] = llvm.insertvalue [[C0]], [[DESC_0]][0] // CHECK: [[PTR:%.*]] = llvm.alloca {{.*}} x i8 // CHECK: [[DESC_2:%.*]] = llvm.insertvalue [[PTR]], [[DESC_1]][1] -// ---- +// ----- // CHECK-LABEL: llvm.func @ranked_null_memref() func.func @ranked_null_memref() { @@ -119,9 +115,9 @@ func.func @ranked_null_memref() { // CHECK-NEXT: %[[C1_:.*]] = llvm.mlir.constant(1 : index) : i64 // CHECK: llvm.mlir.null -// CHECK: %[[NULL:.*]] = llvm.mlir.null : !llvm.ptr +// CHECK: %[[NULL:.*]] = llvm.mlir.null : !llvm.ptr // CHECK-NEXT: %[[DESC_0:.*]] = llvm.mlir.undef : -// CHECK-SAME: !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK-SAME: !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK-NEXT: %[[DESC_1:.*]] = llvm.insertvalue %[[NULL]], %[[DESC_0]][0] // CHECK-NEXT: %[[DESC_2:.*]] = llvm.insertvalue %[[NULL]], %[[DESC_1]][1] // CHECK-NEXT: %[[DESC_3:.*]] = llvm.insertvalue %[[C0]], %[[DESC_2]][2] @@ -130,7 +126,7 @@ func.func @ranked_null_memref() { // CHECK-NEXT: %[[DESC_6:.*]] = llvm.insertvalue %[[C1]], %[[DESC_5]][3, 1] // CHECK-NEXT: %[[DESC_7:.*]] = llvm.insertvalue %[[C1_]], %[[DESC_6]][4, 1] -// ---- +// ----- // CHECK-LABEL: llvm.func @is_valid_memref func.func @is_valid_memref(%buf: memref) -> i1 { @@ -146,19 +142,18 @@ func.func @is_valid_memref(%buf: memref) -> i1 { // CHECK-NEXT: %[[IS_EMPTY_:.*]] = llvm.or %[[IS_EMPTY]], %[[IS_ZERO]] : i1 // CHECK-NEXT: %[[PTR_F32:.*]] = llvm.extractvalue %[[MEMREF]][0] -// CHECK-NEXT: %[[VOID_PTR:.*]] = llvm.bitcast %[[PTR_F32]] : !llvm.ptr to !llvm.ptr -// CHECK-NEXT: %[[NULL_PTR:.*]] = llvm.mlir.null : !llvm.ptr -// CHECK-NEXT: %[[NOT_NULL:.*]] = llvm.icmp "ne" %[[VOID_PTR]], %[[NULL_PTR]] +// CHECK-NEXT: %[[NULL_PTR:.*]] = llvm.mlir.null : !llvm.ptr +// CHECK-NEXT: %[[NOT_NULL:.*]] = llvm.icmp "ne" %[[PTR_F32]], %[[NULL_PTR]] // CHECK-NEXT: %[[PRED:.*]] = llvm.or %[[NOT_NULL]], %[[IS_EMPTY_]] : i1 // CHECK-NEXT: llvm.return %[[PRED]] // ----- -// CHECK-LABEL: llvm.func @_mlir_ciface_tf_jit_compile(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr, i64, !llvm.ptr, i64, i1, i1, i1) -> !llvm.ptr +// CHECK-LABEL: llvm.func 
@_mlir_ciface_tf_jit_compile(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr, i64, !llvm.ptr, i64, i1, i1, i1) -> !llvm.ptr // CHECK: llvm.mlir.global internal constant @[[CODE:jit_module_code_[0-9]+]]("placeholder\00") -// CHECK: @jit_compile_from_str(%[[CTX:.*]]: !llvm.ptr) +// CHECK: @jit_compile_from_str(%[[CTX:.*]]: !llvm.ptr) func.func @jit_compile_from_str(%ctx: !tf_framework.op_kernel_context) -> !tf_framework.jit_callable { // CHECK: %[[ADDR:.*]] = llvm.mlir.addressof @[[CODE]] @@ -205,10 +200,10 @@ func.func @jit_compile_from_str(%ctx: !tf_framework.op_kernel_context) // ----- -// CHECK-LABEL: llvm.func @_mlir_ciface_tf_jit_execute(!llvm.ptr, !llvm.ptr, !llvm.ptr, i64, !llvm.ptr) +// CHECK-LABEL: llvm.func @_mlir_ciface_tf_jit_execute(!llvm.ptr, !llvm.ptr, !llvm.ptr, i64, !llvm.ptr) // CHECK: @jit_execute -// CHECK-SAME: (%[[CTX:.*]]: !llvm.ptr, %[[CALLABLE:.*]]: !llvm.ptr, %[[RANK:.*]]: i64, %[[ARG_DESCR:.*]]: !llvm.ptr) +// CHECK-SAME: (%[[CTX:.*]]: !llvm.ptr, %[[CALLABLE:.*]]: !llvm.ptr, %[[RANK:.*]]: i64, %[[ARG_DESCR:.*]]: !llvm.ptr) func.func @jit_execute(%ctx: !tf_framework.op_kernel_context, %callable : !tf_framework.jit_callable, %arg : memref<*xf32>) -> memref<*xf32> { @@ -216,24 +211,21 @@ func.func @jit_execute(%ctx: !tf_framework.op_kernel_context, // CHECK: %[[T1:.*]] = llvm.insertvalue %[[RANK]], %[[T0]][0] // CHECK: %[[ARG:.*]] = llvm.insertvalue %[[ARG_DESCR]], %[[T1]][1] // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) - // CHECK: %[[RESULT_PTR:.*]] = llvm.alloca %[[C1]] x !llvm.struct<(i64, ptr)> - // CHECK: %[[RESULT_PTR_:.*]] = llvm.bitcast %[[RESULT_PTR]] - + // CHECK: %[[RESULT_PTR:.*]] = llvm.alloca %[[C1]] x !llvm.struct<(i64, ptr)> + // Copy argument(s) to stack-allocated buffer. // CHECK: %[[NUM_ARGS:.*]] = llvm.mlir.constant(1 : i64) - // CHECK: %[[ARGS_PTR:.*]] = llvm.alloca %[[NUM_ARGS]] x !llvm.struct<(i64, ptr)> + // CHECK: %[[ARGS_PTR:.*]] = llvm.alloca %[[NUM_ARGS]] x !llvm.struct<(i64, ptr)> // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) // CHECK: %[[ARGS0_PTR:.*]] = llvm.getelementptr %[[ARGS_PTR]][%[[C0]]] // CHECK: llvm.store %[[ARG]], %[[ARGS0_PTR]] - // CHECK: %[[ARGS_PTR_:.*]] = llvm.bitcast %[[ARGS_PTR]] - // CHECK: llvm.call @_mlir_ciface_tf_jit_execute(%[[CTX]], %[[CALLABLE]], %[[RESULT_PTR_]], %[[NUM_ARGS]], %[[ARGS_PTR_]]) + // CHECK: llvm.call @_mlir_ciface_tf_jit_execute(%[[CTX]], %[[CALLABLE]], %[[RESULT_PTR]], %[[NUM_ARGS]], %[[ARGS_PTR]]) // CHECK: %[[RESULT:.*]] = llvm.load %[[RESULT_PTR]] // Copy unranked memref descriptor to stack-allocated memory. // ... - // CHECK: %[[RESULT_DESCR_SIZE:.*]] = llvm.add %16, %20 // CHECK: %[[FALSE:.*]] = llvm.mlir.constant(false) - // CHECK: %[[STACK_RESULT_DESCR:.*]] = llvm.alloca %[[RESULT_DESCR_SIZE]] x i8 + // CHECK: %[[STACK_RESULT_DESCR:.*]] = llvm.alloca %[[RESULT_DESCR_SIZE:[0-9]*]] x i8 // CHECK: %[[RESULT_DESCR:.*]] = llvm.extractvalue %[[RESULT]][1] // CHECK: "llvm.intr.memcpy"(%[[STACK_RESULT_DESCR]], %[[RESULT_DESCR]], %[[RESULT_DESCR_SIZE]], %[[FALSE]]) // CHECK: llvm.call @free(%[[RESULT_DESCR]]) @@ -244,9 +236,8 @@ func.func @jit_execute(%ctx: !tf_framework.op_kernel_context, // Copy unranked memref descriptor to heap-allocated memory for return. // ... 
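For orientation only (not introduced by this patch): the !llvm.struct<(i64, ptr)> values that the jit_execute checks above build, store, and memcpy follow MLIR's standard unranked-memref C ABI. A minimal host-side sketch of that layout, with illustrative field names:

#include <cstdint>

// Hedged sketch of the unranked-memref descriptor the CHECK lines above refer to:
// a rank plus a pointer to the ranked descriptor (allocated/aligned pointers,
// offset, sizes, strides). Field names are illustrative, not from the patch.
struct UnrankedMemRefDescriptor {
  int64_t rank;      // number of dimensions of the ranked payload
  void *descriptor;  // pointer to the ranked descriptor blob
};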
- // CHECK: %[[RESULT_DESCR_SIZE:.*]] = llvm.add %33, %37 // CHECK: %[[FALSE:.*]] = llvm.mlir.constant(false) - // CHECK: %[[HEAP_RESULT_DESCR:.*]] = llvm.call @malloc(%[[RESULT_DESCR_SIZE]]) + // CHECK: %[[HEAP_RESULT_DESCR:.*]] = llvm.call @malloc(%[[RESULT_DESCR_SIZE:[0-9]*]]) // CHECK: %[[STACK_RESULT_DESCR:.*]] = llvm.extractvalue %[[RESULT]][1] // CHECK: "llvm.intr.memcpy"(%[[HEAP_RESULT_DESCR]], %[[STACK_RESULT_DESCR]], %[[RESULT_DESCR_SIZE]], %[[FALSE]]) // CHECK: %[[T0:.*]] = llvm.mlir.undef diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_kernel_gpu_launch_to_llvm.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_kernel_gpu_launch_to_llvm.mlir index cf5b8f9620f..cf26ac7d6a1 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_kernel_gpu_launch_to_llvm.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_kernel_gpu_launch_to_llvm.mlir @@ -10,20 +10,20 @@ gpu.module @kernel_module attributes {gpu.binary_blob = "BLOB!"} { } } -// CHECK: llvm.func @_mlir_ciface_tf_launch_kernel(!llvm.ptr, !llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr>) +// CHECK: llvm.func @_mlir_ciface_tf_launch_kernel(!llvm.ptr, !llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr) // CHECK-DAG: llvm.mlir.global internal constant @kernel_module_the_kernel_kernel_name("the_kernel\00") // CHECK-DAG: llvm.mlir.global internal constant @kernel_module_blob("BLOB!") // CHECK-LABEL: llvm.func @launch -// CHECK-SAME: (%[[CTX:.*]]: !llvm.ptr, %{{.*}}: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64 +// CHECK-SAME: (%[[CTX:.*]]: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64 func.func @launch(%ctx: !tf_framework.op_kernel_context, %memref: memref) { // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) : i64 - // CHECK: %[[BLOB:.*]] = llvm.mlir.addressof @kernel_module_blob : !llvm.ptr> - // CHECK: %[[BLOB_PTR:.*]] = llvm.getelementptr %[[BLOB]][0, 0] : (!llvm.ptr>) -> !llvm.ptr - // CHECK: %[[NAME:.*]] = llvm.mlir.addressof @kernel_module_the_kernel_kernel_name : !llvm.ptr> - // CHECK: %[[NAME_PTR:.*]] = llvm.getelementptr %[[NAME]][0, 0] : (!llvm.ptr>) -> !llvm.ptr + // CHECK: %[[BLOB:.*]] = llvm.mlir.addressof @kernel_module_blob : !llvm.ptr + // CHECK: %[[BLOB_PTR:.*]] = llvm.getelementptr %[[BLOB]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<5 x i8> + // CHECK: %[[NAME:.*]] = llvm.mlir.addressof @kernel_module_the_kernel_kernel_name : !llvm.ptr + // CHECK: %[[NAME_PTR:.*]] = llvm.getelementptr %[[NAME]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<11 x i8> // CHECK: %[[C7:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[ARGS:.*]] = llvm.alloca %22 x !llvm.ptr : (i32) -> !llvm.ptr> + // CHECK: %[[ARGS:.*]] = llvm.alloca %22 x !llvm.ptr : (i32) -> !llvm.ptr // CHECK: llvm.call @_mlir_ciface_tf_launch_kernel(%[[CTX]], %[[BLOB_PTR]], %[[NAME_PTR]], %[[C1]], %[[C1]], %[[C1]], %[[C1]], %[[C1]], %[[C1]], %[[ARGS]]) %c1 = arith.constant 1 : index gpu.launch_func @kernel_module::@the_kernel diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_jit_invocations.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_jit_invocations.mlir index 79b1ca008b9..2d8ef4cd763 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_jit_invocations.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_jit_invocations.mlir @@ -88,3 +88,40 @@ func.func @binary_sub(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>) -> 
tensor<*x // CHECK-SAME: } // CHECK: %[[RES:.*]] = tf_framework.jit_execute %[[CALLABLE]](%[[ARG0]], %[[ARG1]]) // CHECK: return %[[RES]] + +// CHECK-JFLT-LABEL: @binary_sub +// CHECK-JFLT: %[[ARG0:.*]]: tensor<*xf32>, %[[ARG1:.*]]: tensor<*xf32> +// CHECK-JFLT: %[[LIMIT:.*]] = arith.constant 4294967296 +// CHECK-JFLT: %[[SHAPE1:.*]] = shape.shape_of %[[ARG0]] : tensor<*xf32> -> tensor +// CHECK-JFLT: %[[ELEMENTCOUNT1:.*]] = shape.num_elements %[[SHAPE1]] : tensor -> index +// CHECK-JFLT: %[[COMP1:.*]] = arith.cmpi sgt, %[[ELEMENTCOUNT1]], %[[LIMIT]] : index +// CHECK-JFLT: %[[SHAPE2:.*]] = shape.shape_of %[[ARG1]] : tensor<*xf32> -> tensor +// CHECK-JFLT: %[[ELEMENTCOUNT2:.*]] = shape.num_elements %[[SHAPE2]] : tensor -> index +// CHECK-JFLT: %[[COMP2:.*]] = arith.cmpi sgt, %[[ELEMENTCOUNT2]], %[[LIMIT]] : index +// CHECK-JFLT: %[[COMPRES:.*]] = arith.ori %[[COMP1]], %[[COMP2]] : i1 +// CHECK-JFLT: %[[IFRES:.*]] = scf.if %[[COMPRES]] -> (tensor<*xf32>) { +// CHECK-JFLT: %[[CALLABLE:.*]] = tf_framework.jit_compile_from_str +// CHECK-JFLT-SAME: " +// CHECK-JFLT-SAME: module { +// CHECK-JFLT-SAME: func @main(%[[ARG0_JIT:.*]]: tensor<*xf32>, %[[ARG1_JIT:.*]]: tensor<*xf32>) -> tensor<*xf32> +// CHECK-JFLT-SAME: attributes {tf_entry} +// CHECK-JFLT-SAME: { +// CHECK-JFLT-SAME: %[[RES_JIT:.*]] = \22tf.Sub\22(%[[ARG0_JIT]], %[[ARG1_JIT]]) +// CHECK-JFLT-SAME: return %[[RES_JIT]] +// CHECK-JFLT-SAME: } +// CHECK-JFLT-SAME: } +// CHECK-JFLT-SAME: " +// CHECK-JFLT-SAME: { +// CHECK-JFLT-SAME: cpuCodegen = false +// CHECK-JFLT-SAME: enableFtz = false +// CHECK-JFLT-SAME: maxSupportedRank = 32 : i64 +// CHECK-JFLT-SAME: tileSizes = [1, 2, 3] +// CHECK-JFLT-SAME: unrollFactors = [3, 2, 1] +// CHECK-JFLT-SAME: } +// CHECK-JFLT: %[[RES:.*]] = tf_framework.jit_execute %[[CALLABLE]](%[[ARG0]], %[[ARG1]]) +// CHECK-JFLT: scf.yield %[[RES]] : tensor<*xf32> +// CHECK-JFLT: } else { +// CHECK-JFLT: %[[RES2:.*]] = "tf.Sub"(%[[ARG0]], %[[ARG1]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// CHECK-JFLT: scf.yield %[[RES2]] : tensor<*xf32> +// CHECK-JFLT: } +// CHECK-JFLT: return %[[IFRES]] diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc index 5b98741b053..fe9d26723b9 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc @@ -130,8 +130,8 @@ std::string GetFileCachePath(const std::string cache_dir, llvm::orc::SymbolMap TFFrameworkSymbolMap(llvm::orc::MangleAndInterner mangle) { llvm::orc::SymbolMap symbol_map; auto bind = [&](llvm::StringRef name, auto symbol_ptr) { - symbol_map[mangle(name)] = llvm::JITEvaluatedSymbol( - llvm::pointerToJITTargetAddress(symbol_ptr), llvm::JITSymbolFlags()); + symbol_map[mangle(name)] = {llvm::orc::ExecutorAddr::fromPtr(symbol_ptr), + llvm::JITSymbolFlags()}; }; // Register TF framework symbols. 
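The TFFrameworkSymbolMap change above replaces llvm::JITEvaluatedSymbol / llvm::pointerToJITTargetAddress with the newer ExecutorAddr-based ORC registration. A minimal sketch of the same pattern in isolation, assuming a hypothetical host function my_host_fn and helper MakeSymbolMap (names not from the patch):

#include "llvm/ExecutionEngine/Orc/Core.h"
#include "llvm/ExecutionEngine/Orc/Mangling.h"

extern "C" void my_host_fn();  // hypothetical host symbol exposed to JITed code

llvm::orc::SymbolMap MakeSymbolMap(llvm::orc::MangleAndInterner &mangle) {
  llvm::orc::SymbolMap symbol_map;
  // ExecutorAddr::fromPtr replaces llvm::pointerToJITTargetAddress; the braced
  // initializer pairs the executor address with its JITSymbolFlags.
  symbol_map[mangle("my_host_fn")] = {
      llvm::orc::ExecutorAddr::fromPtr(&my_host_fn), llvm::JITSymbolFlags()};
  return symbol_map;
}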
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc index 4798d8508a0..8d5d583a2dc 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc @@ -29,6 +29,7 @@ #include "llvm/IR/Module.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/Host.h" @@ -117,6 +118,9 @@ Status Run(llvm::StringRef input_file, llvm::StringRef output_file, // Compile. mlir::MLIRContext context; + llvm::SourceMgr source_mgr; + mlir::SourceMgrDiagnosticHandler source_mgr_handler(source_mgr, &context); + TF_ASSIGN_OR_RETURN( mlir::OwningOpRef module, GenerateKernelForTfCode(context, tf_code, architectures, tile_sizes, diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD index 79a43aeb240..70127f58c4a 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD @@ -103,43 +103,57 @@ cc_library( deps = [ ":embed_tf_framework", ":kernel_gen_passes_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/compiler/xla/mlir_hlo:gml_st", + "//tensorflow/compiler/xla/mlir_hlo:lhlo", + "//tensorflow/compiler/xla/mlir_hlo:mhlo_passes", + "//tensorflow/compiler/xla/mlir_hlo:type_conversion", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service/gpu:gpu_asm_opts_util", + "//tensorflow/compiler/xla/service/gpu:target_constants", + "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", + "//tensorflow/core:lib", + "//tensorflow/core/platform:errors", "@llvm-project//llvm:Support", "@llvm-project//llvm:TransformUtils", + "@llvm-project//mlir:AffineDialect", "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:ArithToLLVM", "@llvm-project//mlir:ArithTransforms", - "@llvm-project//mlir:ControlFlowDialect", - "@llvm-project//mlir:ControlFlowToLLVM", - "@llvm-project//mlir:MathToLibm", - "@llvm-project//mlir:MathToLLVM", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:AffineDialect", "@llvm-project//mlir:ComplexDialect", "@llvm-project//mlir:ComplexToLLVM", + "@llvm-project//mlir:ControlFlowDialect", + "@llvm-project//mlir:ControlFlowToLLVM", "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:FuncToLLVM", + "@llvm-project//mlir:FuncTransforms", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:GPUToGPURuntimeTransforms", "@llvm-project//mlir:GPUTransforms", "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMCommonConversion", "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:FuncToLLVM", "@llvm-project//mlir:LinalgDialect", "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:MathDialect", + "@llvm-project//mlir:MathToLLVM", + "@llvm-project//mlir:MathToLibm", + "@llvm-project//mlir:MemRefDialect", "@llvm-project//mlir:MemRefToLLVM", "@llvm-project//mlir:MemRefTransforms", "@llvm-project//mlir:NVVMToLLVMIRTranslation", # buildcleaner: keep - "@llvm-project//mlir:ReconcileUnrealizedCasts", "@llvm-project//mlir:Pass", - "@llvm-project//mlir:LLVMCommonConversion", "@llvm-project//mlir:ROCDLToLLVMIRTranslation", 
# buildcleaner: keep + "@llvm-project//mlir:ReconcileUnrealizedCasts", "@llvm-project//mlir:SCFDialect", - "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:SCFToControlFlow", + "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:ShapeDialect", "@llvm-project//mlir:ShapeToStandard", "@llvm-project//mlir:ShapeTransforms", - "@llvm-project//mlir:FuncTransforms", "@llvm-project//mlir:Support", "@llvm-project//mlir:TensorDialect", "@llvm-project//mlir:TensorTransforms", @@ -148,20 +162,6 @@ cc_library( "@llvm-project//mlir:VectorDialect", "@llvm-project//mlir:VectorToLLVM", "@llvm-project//mlir:VectorTransforms", - "//tensorflow/compiler/xla/mlir_hlo", - "//tensorflow/compiler/xla/mlir_hlo:mhlo_passes", - "//tensorflow/compiler/xla/mlir_hlo:lhlo", - "//tensorflow/compiler/xla/mlir_hlo:gml_st", - "//tensorflow/compiler/xla/mlir_hlo:type_conversion", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", - "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", - "//tensorflow/compiler/xla/service/gpu:gpu_asm_opts_util", - "//tensorflow/compiler/xla/service/gpu:target_constants", - "//tensorflow/compiler/xla/service:hlo_module_config", - "//tensorflow/compiler/xla:debug_options_flags", - "//tensorflow/core:lib", - "//tensorflow/core/platform:errors", ] + if_cuda_is_configured([ "//tensorflow/tsl/platform:cuda_libdevice_path", "//tensorflow/compiler/xla/stream_executor/gpu:asm_compiler", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc index dcb59b2ae06..0d76cd4c93c 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc @@ -70,7 +70,7 @@ class GpuKernelToBlobPass return; } // Forward the error by attaching the message to the gpu module. - gpu_module.emitError(blob_or.status().error_message()); + gpu_module.emitError(blob_or.status().message()); return signalPassFailure(); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc index c91ed7c427c..9a5b0749888 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc @@ -62,14 +62,14 @@ class ConvertToLLVMCallOpPattern : public ConvertOpToLLVMPattern { Location loc, Type size_ty, Type element_ty, std::optional attr, ConversionPatternRewriter *rewriter, std::function create_element) const { - Type element_ptr_ty = LLVM::LLVMPointerType::get(element_ty); + Type ptr_ty = LLVM::LLVMPointerType::get(element_ty.getContext()); // If the attribute is missing or empty, set the element count to 0 and // return NULL. 
if (!attr.has_value() || attr.value().empty()) { Value zero = rewriter->create( loc, size_ty, rewriter->getIntegerAttr(size_ty, 0)); - Value null_ptr = rewriter->create(loc, element_ptr_ty); + Value null_ptr = rewriter->create(loc, ptr_ty); return std::make_pair(zero, null_ptr); } @@ -78,12 +78,12 @@ class ConvertToLLVMCallOpPattern : public ConvertOpToLLVMPattern { Value array_size = rewriter->create( loc, size_ty, rewriter->getIntegerAttr(size_ty, array_attr.size())); Value array_ptr = rewriter->create( - loc, element_ptr_ty, array_size, /*alignment=*/0); - for (auto &e : llvm::enumerate(array_attr)) { + loc, ptr_ty, element_ty, array_size, /*alignment=*/0); + for (const auto &e : llvm::enumerate(array_attr)) { Value index = rewriter->create( loc, size_ty, rewriter->getIntegerAttr(size_ty, e.index())); - Value element_ptr = - rewriter->create(loc, element_ptr_ty, array_ptr, index); + Value element_ptr = rewriter->create(loc, ptr_ty, element_ty, + array_ptr, index); Value element = create_element(e.value()); rewriter->create(loc, element, element_ptr); } @@ -169,17 +169,15 @@ class TFAllocOpConverter : public ConvertToLLVMCallOpPattern { Type GetFuncType() const override { Type llvm_i32_type = IntegerType::get(getDialect().getContext(), 32); - Type llvm_i32_ptr_type = LLVM::LLVMPointerType::get(llvm_i32_type); - Type llvm_void_ptr_type = getVoidPtrType(); + Type llvm_ptr_type = LLVM::LLVMPointerType::get(getDialect().getContext()); return LLVM::LLVMFunctionType::get( - llvm_void_ptr_type, - llvm::ArrayRef( - {/*void* op_kernel_ctx*/ llvm_void_ptr_type, - /*size_t num_elements*/ getIndexType(), - /*size_t element_size*/ getIndexType(), - /*int32_t output_index*/ llvm_i32_type, - /*int32_t num_candidates*/ llvm_i32_type, - /*int32_t* candidate_input_indices*/ llvm_i32_ptr_type})); + llvm_ptr_type, + llvm::ArrayRef({/*void* op_kernel_ctx*/ llvm_ptr_type, + /*size_t num_elements*/ getIndexType(), + /*size_t element_size*/ getIndexType(), + /*int32_t output_index*/ llvm_i32_type, + /*int32_t num_candidates*/ llvm_i32_type, + /*int32_t* candidate_input_indices*/ llvm_ptr_type})); } private: @@ -193,10 +191,8 @@ class TFAllocOpConverter : public ConvertToLLVMCallOpPattern { rewriter, loc, typeConverter->convertType(memref_type)); // TF AllocateRaw returns aligned pointer => AllocatedPtr == AlignedPtr. - Value allocated_type_ptr = rewriter.create( - loc, getElementPtrType(memref_type), allocated_byte_ptr); - memref_desc.setAllocatedPtr(rewriter, loc, allocated_type_ptr); - memref_desc.setAlignedPtr(rewriter, loc, allocated_type_ptr); + memref_desc.setAllocatedPtr(rewriter, loc, allocated_byte_ptr); + memref_desc.setAlignedPtr(rewriter, loc, allocated_byte_ptr); memref_desc.setConstantOffset(rewriter, loc, 0); if (memref_type.getRank() == 0) { @@ -230,9 +226,7 @@ class TFDeallocOpConverter : public ConvertToLLVMCallOpPattern { if (!op.getMemref().getType().isa()) return failure(); MemRefDescriptor memref(adaptor.getMemref()); - Value allocated_bytes_ptr = rewriter.create( - op.getLoc(), getVoidPtrType(), - memref.allocatedPtr(rewriter, op.getLoc())); + Value allocated_bytes_ptr = memref.allocatedPtr(rewriter, op.getLoc()); // Insert function call. 
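A minimal sketch of the opaque-pointer idiom these conversion patterns migrate to, mirroring the calls above (illustrative only; rewriter, loc, and elem_ty stand for the values available inside such a pattern and are not new names from the patch). The pointer type no longer encodes an element type, so alloca/GEP/load take it as an explicit argument and the former bitcasts to and from !llvm.ptr<i8> become unnecessary:

Type ptr_ty = LLVM::LLVMPointerType::get(rewriter.getContext());  // opaque !llvm.ptr
Type i64_ty = rewriter.getI64Type();
Value one = rewriter.create<LLVM::ConstantOp>(loc, i64_ty, rewriter.getI64IntegerAttr(1));
Value zero = rewriter.create<LLVM::ConstantOp>(loc, i64_ty, rewriter.getI64IntegerAttr(0));
// The element type is passed to the ops instead of living in the pointer type.
Value buffer = rewriter.create<LLVM::AllocaOp>(loc, ptr_ty, elem_ty, one, /*alignment=*/0);
Value slot = rewriter.create<LLVM::GEPOp>(loc, ptr_ty, elem_ty, buffer, zero);
Value loaded = rewriter.create<LLVM::LoadOp>(loc, elem_ty, slot);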
FlatSymbolRefAttr tf_func_ref = @@ -296,18 +290,16 @@ class JITCompileFromStrOpConverter StringRef GetFuncName() const override { return kCInterfaceJITCompile; } Type GetFuncType() const override { - auto i8_ptr_ty = - LLVM::LLVMPointerType::get(IntegerType::get(getContext(), 8)); + auto ptr_ty = LLVM::LLVMPointerType::get(getContext()); auto i64_ty = IntegerType::get(getContext(), 64); - Type i64_ptr_ty = LLVM::LLVMPointerType::get(i64_ty); auto i1_ty = IntegerType::get(getContext(), 1); return LLVM::LLVMFunctionType::get( getVoidPtrType(), {/*void* op_kernel_ctx*/ getVoidPtrType(), - /*char* code*/ i8_ptr_ty, + /*char* code*/ ptr_ty, /*int64_t num_tile_sizes*/ i64_ty, - /*int64_t* tile_sizes_ptr*/ i64_ptr_ty, + /*int64_t* tile_sizes_ptr*/ ptr_ty, /*int64_t num_unroll_factors*/ i64_ty, - /*int64_t* unroll_factors_ptr*/ i64_ptr_ty, + /*int64_t* unroll_factors_ptr*/ ptr_ty, /*int64_t max_supported_rank*/ i64_ty, /*bool enable_ftz*/ i1_ty, /*bool index_64bit*/ i1_ty, @@ -331,47 +323,42 @@ class JITExecuteOpConverter : public ConvertToLLVMCallOpPattern { auto loc = op.getLoc(); Type result_ty = getTypeConverter()->convertType(op->getResultTypes().front()); - Type result_ptr_ty = LLVM::LLVMPointerType::get(result_ty); + Type ptr_ty = LLVM::LLVMPointerType::get(getContext()); Type i64_ty = rewriter.getI64Type(); Value one = rewriter.create( loc, i64_ty, rewriter.getI64IntegerAttr(1)); auto result_ptr = - rewriter.create(loc, result_ptr_ty, one, std::nullopt); - Type void_ptr_ty = getVoidPtrType(); - auto result_void_ptr = - rewriter.create(loc, void_ptr_ty, result_ptr); + rewriter.create(loc, ptr_ty, result_ty, one); // Pass the buffer arguments as a stack-allocated array. - Type arg_ptr_ty = - LLVM::LLVMPointerType::get(adaptor.getInputs().front().getType()); + Type args_elem_ty = adaptor.getInputs().front().getType(); Value num_args = rewriter.create( loc, i64_ty, rewriter.getI64IntegerAttr( static_cast(adaptor.getInputs().size()))); - Value args_ptr = rewriter.create(loc, arg_ptr_ty, num_args, - /*alignment=*/0); + Value args_ptr = + rewriter.create(loc, ptr_ty, args_elem_ty, num_args, + /*alignment=*/0); for (const auto &it : llvm::enumerate(adaptor.getInputs())) { Value index = rewriter.create( loc, i64_ty, rewriter.getI64IntegerAttr(it.index())); - Value element_ptr = - rewriter.create(loc, arg_ptr_ty, args_ptr, index); + Value element_ptr = rewriter.create( + loc, ptr_ty, args_elem_ty, args_ptr, index); rewriter.create(loc, it.value(), element_ptr); } - auto args_void_ptr = - rewriter.create(loc, void_ptr_ty, args_ptr); // Materialize runtime call. FlatSymbolRefAttr tf_func_ref = GetOrInsertLLVMFunction(GetFuncName(), GetFuncType(), op, &rewriter); rewriter.create( loc, std::nullopt, tf_func_ref, - ValueRange{adaptor.getCtx(), adaptor.getCallable(), result_void_ptr, - num_args, args_void_ptr}); + ValueRange{adaptor.getCtx(), adaptor.getCallable(), result_ptr, + num_args, args_ptr}); // Copy result (including the descriptor) to a stack-allocated buffer and // free the old descriptor. 
llvm::SmallVector final_result = { - rewriter.create(loc, result_ptr)}; + rewriter.create(loc, result_ty, result_ptr)}; if (failed(copyUnrankedDescriptors(rewriter, loc, op->getResultTypes(), final_result, /*toDynamic=*/false))) { @@ -387,13 +374,13 @@ class JITExecuteOpConverter : public ConvertToLLVMCallOpPattern { Type GetFuncType() const override { auto i64_ty = IntegerType::get(getContext(), 64); - auto void_ptr_ty = getVoidPtrType(); + auto ptr_ty = LLVM::LLVMPointerType::get(getContext()); return LLVM::LLVMFunctionType::get(getVoidType(), - {/*void* op_kernel_ctx*/ void_ptr_ty, - /*void* callable*/ void_ptr_ty, - /*void* result*/ void_ptr_ty, + {/*void* op_kernel_ctx*/ ptr_ty, + /*void* callable*/ ptr_ty, + /*void* result*/ ptr_ty, /*int64_t num_args*/ i64_ty, - /*void* args_ptr*/ void_ptr_ty}); + /*void* args_ptr*/ ptr_ty}); } }; @@ -426,10 +413,10 @@ class ReportErrorOpConverter StringRef GetFuncName() const override { return kCInterfaceReportError; } Type GetFuncType() const override { MLIRContext *ctx = &getTypeConverter()->getContext(); - auto i8_ptr_type = LLVM::LLVMPointerType::get(IntegerType::get(ctx, 8)); + auto ptr_type = LLVM::LLVMPointerType::get(ctx); auto i32_type = IntegerType::get(ctx, 32); - return LLVM::LLVMFunctionType::get( - getVoidType(), {getVoidPtrType(), i32_type, i8_ptr_type}); + return LLVM::LLVMFunctionType::get(getVoidType(), + {getVoidPtrType(), i32_type, ptr_type}); } private: @@ -474,6 +461,7 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern { ConversionPatternRewriter &rewriter) const override { Location loc = null_memref_op->getLoc(); LLVMTypeConverter type_converter = *getTypeConverter(); + MLIRContext *ctx = null_memref_op.getContext(); mlir::Operation *op = null_memref_op.getOperation(); auto shaped_result_type = null_memref_op.getType().cast(); @@ -481,9 +469,8 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern { shaped_result_type.getMemorySpace().dyn_cast_or_null(); unsigned address_space = static_cast(mem_space ? mem_space.getInt() : 0); - - Type elem_type = shaped_result_type.getElementType(); - Type llvm_elem_type = type_converter.convertType(elem_type); + LLVM::LLVMPointerType llvm_ptr_type = + LLVM::LLVMPointerType::get(ctx, address_space); Value zero = createIndexConstant(rewriter, loc, 0); if (auto result_type = null_memref_op.getType().dyn_cast()) { @@ -497,8 +484,7 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern { // Prepare packed args [allocatedPtr, alignedPtr, offset, sizes, strides] // to create a memref descriptor. - Value null = rewriter.create( - loc, LLVM::LLVMPointerType::get(llvm_elem_type, address_space)); + Value null = rewriter.create(loc, llvm_ptr_type); SmallVector packed_values{null, null, zero}; packed_values.append(sizes); packed_values.append(strides); @@ -529,21 +515,18 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern { UnrankedMemRefDescriptor::computeSizes(rewriter, loc, *getTypeConverter(), desc, addressSpace, sizes); Value underlying_desc_ptr = rewriter.create( - loc, getVoidPtrType(), sizes.front(), std::nullopt); + loc, getVoidPtrType(), IntegerType::get(getContext(), 8), + sizes.front()); // Populate underlying ranked descriptor. 
- LLVM::LLVMPointerType elem_ptr_ptr_type = LLVM::LLVMPointerType::get( - LLVM::LLVMPointerType::get(llvm_elem_type, address_space)); - - Value null = rewriter.create( - loc, LLVM::LLVMPointerType::get(llvm_elem_type, address_space)); + Value null = rewriter.create(loc, llvm_ptr_type); UnrankedMemRefDescriptor::setAllocatedPtr( - rewriter, loc, underlying_desc_ptr, elem_ptr_ptr_type, null); + rewriter, loc, underlying_desc_ptr, llvm_ptr_type, null); UnrankedMemRefDescriptor::setAlignedPtr(rewriter, loc, *getTypeConverter(), - underlying_desc_ptr, - elem_ptr_ptr_type, null); + underlying_desc_ptr, llvm_ptr_type, + null); UnrankedMemRefDescriptor::setOffset(rewriter, loc, *getTypeConverter(), - underlying_desc_ptr, elem_ptr_ptr_type, + underlying_desc_ptr, llvm_ptr_type, zero); desc.setMemRefDescPtr(rewriter, loc, underlying_desc_ptr); @@ -576,8 +559,7 @@ class IsValidMemRefOpConverter rewriter.create(loc, is_empty_shape, is_zero_size); } - Value ptr = rewriter.create( - loc, getVoidPtrType(), desc.allocatedPtr(rewriter, loc)); + Value ptr = desc.allocatedPtr(rewriter, loc); Value null = rewriter.create(loc, getVoidPtrType()); Value is_not_nullptr = rewriter.create( loc, rewriter.getI1Type(), LLVM::ICmpPredicate::ne, ptr, null); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc index 8f7ce2f0a0c..136b278e8c9 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include +#include #include "llvm/ADT/STLExtras.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" // from @llvm-project @@ -37,6 +39,7 @@ limitations under the License. 
#include "mlir/Dialect/LLVMIR/LLVMTypes.h" // from @llvm-project #include "mlir/Dialect/Math/IR/Math.h" // from @llvm-project #include "mlir/Dialect/MemRef/Transforms/Passes.h" // from @llvm-project +#include "mlir/Dialect/MemRef/Transforms/Transforms.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" @@ -77,10 +80,7 @@ class ConvertLaunchFuncOpToTfRuntimeCallPattern MLIRContext *context_ = &this->getTypeConverter()->getContext(); Type llvm_void_type_ = LLVM::LLVMVoidType::get(context_); - Type llvm_pointer_type_ = - LLVM::LLVMPointerType::get(IntegerType::get(context_, 8)); - Type llvm_pointer_pointer_type_ = - LLVM::LLVMPointerType::get(llvm_pointer_type_); + Type llvm_pointer_type_ = LLVM::LLVMPointerType::get(context_); Type llvm_int8_type_ = IntegerType::get(context_, 8); Type llvm_int32_type_ = IntegerType::get(context_, 32); Type llvm_int64_type_ = IntegerType::get(context_, 64); @@ -119,25 +119,24 @@ Value ConvertLaunchFuncOpToTfRuntimeCallPattern::generateParamsArray( auto one = builder.create(loc, llvm_int32_type_, builder.getI32IntegerAttr(1)); auto struct_ptr = builder.create( - loc, LLVM::LLVMPointerType::get(struct_type), one, /*alignment=*/0); + loc, llvm_pointer_type_, struct_type, one, /*alignment=*/0); auto array_size = builder.create( loc, llvm_int32_type_, builder.getI32IntegerAttr(num_arguments)); auto array_ptr = builder.create( - loc, llvm_pointer_pointer_type_, array_size, /*alignment=*/0); + loc, llvm_pointer_type_, llvm_pointer_type_, array_size, /*alignment=*/0); auto zero = builder.create(loc, llvm_int32_type_, builder.getI32IntegerAttr(0)); for (auto en : llvm::enumerate(arguments)) { auto index = builder.create( loc, llvm_int32_type_, builder.getI32IntegerAttr(en.index())); auto field_ptr = builder.create( - loc, LLVM::LLVMPointerType::get(argument_types[en.index()]), struct_ptr, + loc, llvm_pointer_type_, struct_type, struct_ptr, ArrayRef{zero, index.getResult()}); builder.create(loc, en.value(), field_ptr); - auto element_ptr = builder.create( - loc, llvm_pointer_pointer_type_, array_ptr, index.getResult()); - auto casted = - builder.create(loc, llvm_pointer_type_, field_ptr); - builder.create(loc, casted, element_ptr); + auto element_ptr = + builder.create(loc, llvm_pointer_type_, llvm_pointer_type_, + array_ptr, index.getResult()); + builder.create(loc, field_ptr, element_ptr); } return array_ptr; } @@ -179,7 +178,7 @@ LogicalResult ConvertLaunchFuncOpToTfRuntimeCallPattern::matchAndRewrite( name_buffer.append("_blob"); Value module_blob = LLVM::createGlobalString(loc, rewriter, name_buffer.str(), binary_attr.getValue(), - LLVM::Linkage::Internal, false); + LLVM::Linkage::Internal, true); // Make sure the trailing zero is included in the constant. auto kernel_name = launch_op.getKernelName().getValue(); @@ -193,7 +192,7 @@ LogicalResult ConvertLaunchFuncOpToTfRuntimeCallPattern::matchAndRewrite( .toStringRef(kernel_name_global_name_buffer); auto kernel_name_global = LLVM::createGlobalString( loc, rewriter, kernel_name_global_name, kernel_name_buffer, - LLVM::Linkage::Internal, false); + LLVM::Linkage::Internal, true); // The TensorFlow OpKernelContext is the first argument of the surrounding // LLVMFunc. 
@@ -208,19 +207,18 @@ LogicalResult ConvertLaunchFuncOpToTfRuntimeCallPattern::matchAndRewrite( if (!function) { PatternRewriter::InsertionGuard guard(rewriter); auto function_type = LLVM::LLVMFunctionType::get( - llvm_void_type_, - { - llvm_pointer_type_, /* void* context */ - llvm_pointer_type_, /* void* module_blob */ - llvm_pointer_type_, /* void* function_name */ - llvm_intptr_type_, /* intptr_t grid_x_dim */ - llvm_intptr_type_, /* intptr_t grid_y_dim */ - llvm_intptr_type_, /* intptr_t grid_z_dim */ - llvm_intptr_type_, /* intptr_t block_x_dim */ - llvm_intptr_type_, /* intptr_t block_y_dim */ - llvm_intptr_type_, /* intptr_t block_z_dim */ - llvm_pointer_pointer_type_, /* void **kernel_params */ - }); + llvm_void_type_, { + llvm_pointer_type_, /* void* context */ + llvm_pointer_type_, /* void* module_blob */ + llvm_pointer_type_, /* void* function_name */ + llvm_intptr_type_, /* intptr_t grid_x_dim */ + llvm_intptr_type_, /* intptr_t grid_y_dim */ + llvm_intptr_type_, /* intptr_t grid_z_dim */ + llvm_intptr_type_, /* intptr_t block_x_dim */ + llvm_intptr_type_, /* intptr_t block_y_dim */ + llvm_intptr_type_, /* intptr_t block_z_dim */ + llvm_pointer_type_, /* void **kernel_params */ + }); rewriter.setInsertionPointToStart( launch_op->getParentOfType().getBody()); function = rewriter.create( @@ -257,15 +255,13 @@ class TFKernelToLLVMPass // Populate type conversions. MLIRContext *ctx = m.getContext(); - // TODO(b/267828330): Migrate to opaque pointers. LowerToLLVMOptions options(&getContext()); - options.useOpaquePointers = false; LLVMTypeConverter type_converter(ctx, options); type_converter.addConversion([&](tf_framework::OpKernelContextType type) { - return LLVM::LLVMPointerType::get(IntegerType::get(ctx, 8)); + return LLVM::LLVMPointerType::get(type.getContext()); }); type_converter.addConversion([&](tf_framework::JITCallableType type) { - return LLVM::LLVMPointerType::get(IntegerType::get(ctx, 8)); + return LLVM::LLVMPointerType::get(type.getContext()); }); // Populate patterns. diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_to_jit_invocations.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_to_jit_invocations.cc index a1e6e7b6c3c..1afa6372434 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_to_jit_invocations.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_to_jit_invocations.cc @@ -64,6 +64,10 @@ bool IsUnaryTFOperation(Operation *op) { return IsSingleResultTFOperation(op) && op->getNumOperands() == 1; } +bool IsBinaryTFOperation(Operation *op) { + return IsSingleResultTFOperation(op) && op->getNumOperands() == 2; +} + struct TFToJITInvocationsPattern : public RewritePattern { explicit TFToJITInvocationsPattern(MLIRContext *ctx) : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {} @@ -116,20 +120,30 @@ struct TFToI64JITInvocationForLargeTensorsPattern : public RewritePattern { LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override { - if (!IsUnaryTFOperation(op) || + if ((!IsUnaryTFOperation(op) && !IsBinaryTFOperation(op)) || !llvm::isa(op->getParentOp())) { return failure(); } // Create large argument condition. 
auto loc = op->getLoc(); - auto arg = op->getOperands().front(); - auto shape = rewriter.create(loc, arg); - auto num_elems = rewriter.create(loc, shape); + auto arg_1 = op->getOperands().front(); + auto shape_1 = rewriter.create(loc, arg_1); + auto num_elems_1 = rewriter.create(loc, shape_1); Value cst_i32_limit = rewriter.create(loc, i32Limit); Value large_tensor_predicate = rewriter.create( - loc, arith::CmpIPredicate::sgt, num_elems, cst_i32_limit); + loc, arith::CmpIPredicate::sgt, num_elems_1, cst_i32_limit); + if (IsBinaryTFOperation(op)) { + auto arg_2 = op->getOperands().back(); + auto shape_2 = rewriter.create(loc, arg_2); + auto num_elems_2 = rewriter.create(loc, shape_2); + large_tensor_predicate = rewriter.create( + loc, large_tensor_predicate, + // Compare op to check size of the second op + rewriter.create(loc, arith::CmpIPredicate::sgt, + num_elems_2, cst_i32_limit)); + } // Create dispatch code. auto jit_body_builder_fn = [&](OpBuilder &b, Location loc) { @@ -152,9 +166,10 @@ struct TFToI64JITInvocationForLargeTensorsPattern : public RewritePattern { } // Create JIT execute op. + assert(op->getOperands().size() == 1 || op->getOperands().size() == 2); auto jit_execute_op = b.create( loc, op->getResultTypes().front(), /*ctx=*/Value(), - jit_compile_op.getResult(), arg); + jit_compile_op.getResult(), op->getOperands()); b.create(loc, jit_execute_op.getResult()); }; auto aot_body_builder_fn = [&](OpBuilder &b, Location loc) { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc index c1dbe67f5b6..b1c909bb523 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project namespace mlir { namespace kernel_gen { @@ -49,16 +50,18 @@ Value CreateOrFindGlobalStringConstant(Location loc, StringRef global_name, Operation* global_constant = SymbolTable::lookupNearestSymbolFrom( module, b->getStringAttr(global_name)); if (global_constant) { - Value global_ptr = b->create( - loc, cast(global_constant)); + auto global_op = cast(global_constant); + StringRef symbol_name = global_op.getName(); + Type symbol_type = global_op.getType(); + Type ptr_type = LLVM::LLVMPointerType::get(b->getContext()); + Value global_ptr = b->create(loc, ptr_type, symbol_name); Value c0 = b->create(loc, b->getI64Type(), b->getIndexAttr(0)); - return b->create( - loc, LLVM::LLVMPointerType::get(b->getIntegerType(8)), global_ptr, - ValueRange{c0, c0}); + return b->create(loc, ptr_type, symbol_type, global_ptr, + ValueRange{c0, c0}); } return LLVM::createGlobalString(loc, *b, global_name, content, - LLVM::Linkage::Internal, false); + LLVM::Linkage::Internal, true); } } // namespace transforms diff --git a/tensorflow/compiler/mlir/tosa/BUILD b/tensorflow/compiler/mlir/tosa/BUILD index b456496bc5b..586204d9594 100644 --- a/tensorflow/compiler/mlir/tosa/BUILD +++ b/tensorflow/compiler/mlir/tosa/BUILD @@ -67,6 +67,7 @@ cc_library( deps = [ "//tensorflow/compiler/mlir/lite:tensorflow_lite", "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", ], ) @@ -178,11 +179,16 @@ cc_library( name = "tfl_passes", srcs = [ "tfl_passes.cc", + "transforms/convert_metadata.cc", "transforms/convert_tfl_uint8.cc", "transforms/legalize_tfl.cc", "transforms/lower_complex_types.cc", + "transforms/lower_global_tensors.cc", + "transforms/retain_call_once_funcs.cc", + "transforms/strip_metadata.cc", "transforms/strip_quant_types.cc", "transforms/tfl_legalize_patterns.inc", + "transforms/verify_fully_converted.cc", ], hdrs = [ "tfl_passes.h", @@ -202,8 +208,10 @@ cc_library( "@llvm-project//mlir:Dialect", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:MLProgramDialect", "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:ReconcileUnrealizedCasts", "@llvm-project//mlir:Support", "@llvm-project//mlir:TosaDialect", "@llvm-project//mlir:Transforms", diff --git a/tensorflow/compiler/mlir/tosa/tests/convert_metadata.mlir b/tensorflow/compiler/mlir/tosa/tests/convert_metadata.mlir new file mode 100644 index 00000000000..7fb03c7728c --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/tests/convert_metadata.mlir @@ -0,0 +1,25 @@ +// RUN: tf-opt --split-input-file --pass-pipeline='builtin.module(func.func(tosa-tflite-convert-function-metadata))' %s | FileCheck %s + +module attributes {tfl.schema_version = 3 : i32} { + // CHECK: func.func @main( + // CHECK-SAME: %arg0: tensor {ml_program.identifier = "input0"}, + // CHECK-SAME: %arg1: tensor {ml_program.identifier = "input1"} + // CHECK-SAME: ) -> ( + // CHECK-SAME: tensor {ml_program.identifier = "output0"}, + // CHECK-SAME: tensor {ml_program.identifier = "output1"}) + func.func @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) attributes { + tf.entry_function = {inputs = "input0,input1", outputs = "output0,output1"} + } { + return %arg0, %arg1 : tensor, tensor + } + + // CHECK: func.func @no_input( + // CHECK-SAME: ) -> ( + // CHECK-SAME: tensor<1xf32> {ml_program.identifier = "output0"}) + func.func @no_input() 
-> (tensor<1xf32>) attributes { + tf.entry_function = {outputs = "output0"} + } { + %0 = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> + return %0 : tensor<1xf32> + } +} diff --git a/tensorflow/compiler/mlir/tosa/tests/lower_global_tensors.mlir b/tensorflow/compiler/mlir/tosa/tests/lower_global_tensors.mlir new file mode 100644 index 00000000000..5b8bd2cc3c0 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/tests/lower_global_tensors.mlir @@ -0,0 +1,145 @@ +// RUN: tf-opt --split-input-file --pass-pipeline='builtin.module(tflite-lower-global-tensors)' %s | FileCheck %s + +module { + // CHECK: ml_program.global private mutable @Variable(dense<1.000000e+00> : tensor<16x16xf32>) + // CHECK-LABEL: func.func @state + func.func @state(%arg0: tensor<16x16xf32>) -> () { + "tfl.call_once"() {session_init_function = "StateInit"} : () -> () + return + } + + func.func private @StateInit() { + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16x16xf32>} : () -> tensor<16x16xf32> + "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return + } +} + +// ----- + +module { + // CHECK: ml_program.global private mutable @Variable(dense<1.000000e+00> : tensor<16x16xf32>) + + // CHECK-LABEL: func.func @assign + func.func @assign(%arg0: tensor<16x16xf32>) -> () { + "tfl.call_once"() {session_init_function = "AssignInit"} : () -> () + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + + // CHECK: ml_program.global_store @Variable = %arg0 + "tfl.assign_variable"(%0, %arg0) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return + } + + func.func private @AssignInit() { + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16x16xf32>} : () -> tensor<16x16xf32> + "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return + } +} + +// ----- + +module { + // CHECK: ml_program.global private mutable @Variable(dense<1.000000e+00> : tensor<16x16xf32>) + + // CHECK-LABEL: func.func @read + func.func @read(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { + "tfl.call_once"() {session_init_function = "ReadInit"} : () -> () + + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + + // CHECK: %[[LOAD:.+]] = ml_program.global_load @Variable : tensor<16x16xf32> + %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> + return %1 : tensor<16x16xf32> + } + + func.func private @ReadInit() { + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16x16xf32>} : () -> tensor<16x16xf32> + "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return + } +} + +// ----- + +module { + // CHECK: ml_program.global private mutable @Variable(dense<2.000000e+00> : tensor<16x16xf32>) + + // CHECK-LABEL: func.func @readAssign + func.func @readAssign(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { + "tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> () + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + + // CHECK: %[[LOAD:.+]] = 
ml_program.global_load @Variable : tensor<16x16xf32> + %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> + + // CHECK: %[[ADD:.+]] = tfl.add %[[LOAD]], %arg0 + %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<16x16xf32> + + // CHECK: ml_program.global_store @Variable = %[[ADD]] + "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return %2 : tensor<16x16xf32> + } + func.func private @ReadAssignInit() { + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.pseudo_const"() {value = dense<2.000000e+00> : tensor<16x16xf32>} : () -> tensor<16x16xf32> + "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return + } +} + +// ----- + +module { + // CHECK: ml_program.global private mutable @Variable(dense<42> : tensor<2x3xi8>) + // CHECK-LABEL: func.func @readAssignQuant + func.func @readAssignQuant(%arg0: tensor<2x3x!quant.uniform>) -> (tensor<2x3x!quant.uniform>) { + "tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> () + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + + // CHECK: %[[ADDR:.+]] = ml_program.global_load @Variable : tensor<2x3xi8> + // CHECK: %[[CAST:.+]] = builtin.unrealized_conversion_cast %[[ADDR]] : tensor<2x3xi8> to tensor<2x3x!quant.uniform> + %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<2x3x!quant.uniform> + + // CHECK: %[[ADD:.+]] = tfl.add %[[CAST]], %arg0 {fused_activation_function = "NONE"} + %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<2x3x!quant.uniform> + + // CHECK: %[[CAST2:.+]] = builtin.unrealized_conversion_cast %[[ADD]] : tensor<2x3x!quant.uniform> to tensor<2x3xi8> + // CHECK: ml_program.global_store @Variable = %[[CAST2]] + "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform>) -> () + return %2 : tensor<2x3x!quant.uniform> + } + func.func private @ReadAssignInit() { + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.pseudo_const"() {qtype = tensor<2x3x!quant.uniform>, value = dense<42> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform> + "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform>) -> () + return + } +} + +// ----- + +module { + // CHECK-label: @nostate + func.func @nostate(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { + "tfl.call_once"() {session_init_function = "NoStateInit"} : () -> () + // CHECK: tfl.var_handle + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + + // CHECK: tfl.read_variable + %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> + + %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<16x16xf32> + + // CHECK: tfl.assign_variable + "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return %2 : tensor<16x16xf32> + } + func.func private @NoStateInit() { + return + } +} + diff --git a/tensorflow/compiler/mlir/tosa/tests/multi_add.mlir b/tensorflow/compiler/mlir/tosa/tests/multi_add.mlir new file mode 100644 index 00000000000..c513f2ec936 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/tests/multi_add.mlir @@ -0,0 +1,16 @@ +// RUN: tf-opt --tfl-to-tosa-pipeline=target-compilation-backend %s | FileCheck %s + +// CHECK: tensor<1x8x8x3xf32> 
{ml_program.identifier = "a"} +// CHECK-SAME: tensor<1x8x8x3xf32> {ml_program.identifier = "b"} +// CHECK-SAME: tensor<1x8x8x3xf32> {ml_program.identifier = "c"} +// CHECK-SAME: tensor<1x8x8x3xf32> {ml_program.identifier = "d"} +// CHECK-SAME: -> (tensor<1x8x8x3xf32> {ml_program.identifier = "x"}, tensor<1x8x8x3xf32> {ml_program.identifier = "y"}) + +module attributes {tfl.schema_version = 3 : i32} { + func.func @main(%arg0: tensor<1x8x8x3xf32>, %arg1: tensor<1x8x8x3xf32>, %arg2: tensor<1x8x8x3xf32>, %arg3: tensor<1x8x8x3xf32>) -> (tensor<1x8x8x3xf32>, tensor<1x8x8x3xf32>) attributes {tf.entry_function = {inputs = "a,b,c,d", outputs = "x,y"}} { + %0 = tfl.add %arg1, %arg2 {fused_activation_function = "NONE"} : tensor<1x8x8x3xf32> + %1 = tfl.add %arg0, %0 {fused_activation_function = "NONE"} : tensor<1x8x8x3xf32> + %2 = tfl.add %arg3, %0 {fused_activation_function = "NONE"} : tensor<1x8x8x3xf32> + return %1, %2 : tensor<1x8x8x3xf32>, tensor<1x8x8x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir b/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir new file mode 100644 index 00000000000..5719fd35989 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir @@ -0,0 +1,21 @@ +// RUN: tf-opt --split-input-file --pass-pipeline='builtin.module(tflite-retain-call-once-funcs)' %s | FileCheck %s + +// CHECK-LABEL: module { +module { + // CHECK-LABEL: @main + func.func @main(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { + // CHECK: "tfl.call_once"() {session_init_function = "NoOp", session_init_function_symbol = @NoOp} : () -> () + "tfl.call_once"() {session_init_function = "NoOp"} : () -> () + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> + %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<16x16xf32> + "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return %2 : tensor<16x16xf32> + } + func.func private @NoOp() { + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<16x16xf32>} : () -> tensor<16x16xf32> + "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return + } +} diff --git a/tensorflow/compiler/mlir/tosa/tests/strip_metadata.mlir b/tensorflow/compiler/mlir/tosa/tests/strip_metadata.mlir new file mode 100644 index 00000000000..f2198823a6d --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/tests/strip_metadata.mlir @@ -0,0 +1,14 @@ +// RUN: tf-opt --pass-pipeline='builtin.module(tosa-tflite-strip-module-metadata,func.func(tosa-tflite-strip-function-metadata))' %s | FileCheck %s + +// CHECK-LABEL: module { +// CHECK-NOT: tf.schema_version +module attributes {tfl.schema_version = 3 : i32} { + // CHECK: func.func @main + // CHECK-NOT: tf.entry_function + func.func @main(%arg0: tensor<1x8x8x3xf32>) -> tensor<1x8x8x3xf32> attributes {tf.entry_function = {inputs = "input", outputs = "output"}} { + // CHECK-NEXT: tfl.add + %0 = tfl.add %arg0, %arg0 {fused_activation_function = "NONE"} : tensor<1x8x8x3xf32> + %1 = tfl.add %0, %arg0 {fused_activation_function = "NONE"} : tensor<1x8x8x3xf32> + return %1 : tensor<1x8x8x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir index 
c0e5b23e3b8..5cacdf03552 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir @@ -7,10 +7,10 @@ // ----- // CHECK-LABEL: test_conv2d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<16xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<[3, 0, 1, 2]> : tensor<4xi32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<[3, 0, 1, 2]> : tensor<4xi32>} // CHECK-DAG: %[[VAR2:.*]] = "tosa.transpose"(%arg1, %[[VAR1]]) -// CHECK: %[[VAR3:.*]] = "tosa.conv2d"(%arg0, %[[VAR2]], %[[VAR0]]) {dilation = array, pad = array, stride = array} +// CHECK: %[[VAR3:.*]] = "tosa.conv2d"(%arg0, %[[VAR2]], %[[VAR0]]) <{dilation = array, pad = array, stride = array} func.func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x16xf32>) -> tensor<1x32x32x16xf32> { %3 = "tf.Conv2D"(%arg0, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x32x32x8xf32>, tensor<2x2x8x16xf32>) -> tensor<1x32x32x16xf32> func.return %3 : tensor<1x32x32x16xf32> @@ -19,8 +19,8 @@ func.func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x16xf32> // ----- // CHECK-LABEL: test_depthwise_conv2d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<16xf32>} -// CHECK: %[[VAR1:.*]] = "tosa.depthwise_conv2d"(%arg0, %arg1, %0) {dilation = array, pad = array, stride = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>} +// CHECK: %[[VAR1:.*]] = "tosa.depthwise_conv2d"(%arg0, %arg1, %0) <{dilation = array, pad = array, stride = array} func.func @test_depthwise_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x2xf32>) -> tensor<1x32x32x16xf32> { %5 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x32x32x8xf32>, tensor<2x2x8x2xf32>) -> tensor<1x32x32x16xf32> %6 = "tf.Identity"(%5) : (tensor<1x32x32x16xf32>) -> tensor<1x32x32x16xf32> @@ -31,9 +31,9 @@ func.func @test_depthwise_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2 // CHECK-LABEL: @test_transpose_conv2d // CHECK-SAME: %[[ARG0:.*]]: tensor<1x32x32x8xf32>, %[[ARG1:.*]]: tensor<1x1x16x8xf32> -// CHECK: %[[CONST:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<16xf32>} -// CHECK: %[[RESHAPE:.*]] = "tosa.reshape"(%[[ARG1]]) {new_shape = array} -// CHECK: %[[TRANSPOSE:.*]] = "tosa.transpose_conv2d"(%[[ARG0]], %[[RESHAPE]], %[[CONST]]) {out_pad = array, out_shape = array, stride = array} +// CHECK: %[[CONST:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>} +// CHECK: %[[RESHAPE:.*]] = "tosa.reshape"(%[[ARG1]]) <{new_shape = array} +// CHECK: %[[TRANSPOSE:.*]] = "tosa.transpose_conv2d"(%[[ARG0]], %[[RESHAPE]], %[[CONST]]) <{out_pad = array, out_shape = array, stride = array} // CHECK: return %[[TRANSPOSE]] func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<1x1x16x8xf32>) -> tensor<1x32x32x16xf32> { %3 = "tf.Const"() {value = dense<[1, 32, 32, 16]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -46,10 +46,10 @@ func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<1x1 // CHECK-LABEL: test_conv3d // CHECK-SAME: %[[VAL_0:.*]]: tensor<2x4x128x128x8xf32> // 
CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3x3x2x4xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<4xf32>} -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() {value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>} +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<4xf32>} +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>} // CHECK: %[[VAL_4:.*]] = "tosa.transpose"(%[[VAL_1]], %[[VAL_3]]) -// CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) {dilation = array, pad = array, stride = array} +// CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) <{dilation = array, pad = array, stride = array} func.func @test_conv3d(%arg0: tensor<2x4x128x128x8xf32>, %arg1: tensor<2x3x3x2x4xf32>) -> tensor<2x4x64x64x4xf32> { %0 = "tf.Conv3D"(%arg0, %arg1) {data_format = "NDHWC", device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 2, 1]} : (tensor<2x4x128x128x8xf32>, tensor<2x3x3x2x4xf32>) -> tensor<2x4x64x64x4xf32> return %0 : tensor<2x4x64x64x4xf32> @@ -61,9 +61,9 @@ func.func @test_conv3d(%arg0: tensor<2x4x128x128x8xf32>, %arg1: tensor<2x3x3x2x4 // CHECK-SAME: %[[VAL_0:.*]]: tensor<3x32x16x16x5xf32> // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3x3x5x10xf32> // CHECK-SAME: %[[VAL_2:.*]]: tensor<10xf32>) -> tensor<3x32x16x16x10xf32> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() {value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>} +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>} // CHECK: %[[VAL_4:.*]] = "tosa.transpose"(%[[VAL_1]], %[[VAL_3]]) -// CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) {dilation = array, pad = array, stride = array} +// CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) <{dilation = array, pad = array, stride = array} func.func @test_conv3d_bias(%arg0: tensor<3x32x16x16x5xf32>, %arg1: tensor<2x3x3x5x10xf32>, %bias: tensor<10xf32>) -> tensor<3x32x16x16x10xf32> { %0 = "tf.Conv3D"(%arg0, %arg1) {data_format = "NDHWC", device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1, 1]} : (tensor<3x32x16x16x5xf32>, tensor<2x3x3x5x10xf32>) -> tensor<3x32x16x16x10xf32> %1 = "tf.BiasAdd"(%0, %bias) {data_format = "NHWC", device = ""} : (tensor<3x32x16x16x10xf32>, tensor<10xf32>) -> tensor<3x32x16x16x10xf32> @@ -91,7 +91,7 @@ func.func @test_sub(%arg0: tensor<1x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> te // ----- // CHECK-LABEL: test_mul -// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg1) {shift = 0 : i32} +// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg1) <{shift = 0 : i32} func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Mul"(%arg0, %arg1) : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xf32> func.return %2 : tensor<13x21x3xf32> @@ -137,7 +137,7 @@ func.func @test_rcp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_relu -// CHECK: %[[VAR0:.*]] = "tosa.clamp"(%arg0) {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.clamp"(%arg0) <{max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} func.func @test_relu(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Relu"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> func.return %2 : tensor<13x21x3xf32> @@ -146,7 
+146,7 @@ func.func @test_relu(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_relu6 -// CHECK: %[[VAR0:.*]] = "tosa.clamp"(%arg0) {max_fp = 6.000000e+00 : f32, max_int = 6 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.clamp"(%arg0) <{max_fp = 6.000000e+00 : f32, max_int = 6 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} func.func @test_relu6(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Relu6"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> func.return %2 : tensor<13x21x3xf32> @@ -155,9 +155,9 @@ func.func @test_relu6(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_leaky_relu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<1x1xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1xf32>} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.mul"(%arg0, %[[VAR1]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1xf32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<5.000000e-01> : tensor<1x1xf32>} +// CHECK-DAG: %[[VAR2:.*]] = "tosa.mul"(%arg0, %[[VAR1]]) <{shift = 0 : i32} // CHECK-DAG: %[[VAR3:.*]] = "tosa.greater_equal"(%arg0, %[[VAR0]]) // CHECK: %[[VAR6:.*]] = "tosa.select"(%[[VAR3]], %arg0, %[[VAR2]]) func.func @test_leaky_relu(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { @@ -168,7 +168,7 @@ func.func @test_leaky_relu(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { // ----- // CHECK-LABEL: test_concat -// CHECK: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1) {axis = 0 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1) <{axis = 0 : i64} func.func @test_concat(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<26x21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor %3 = "tf.ConcatV2"(%arg0, %arg1, %2) : (tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor) -> tensor<26x21x3xf32> @@ -241,8 +241,8 @@ func.func @test_logical_not(%arg0: tensor<1x21x3xi1>) -> tensor<1x21x3xi1> { // ----- // CHECK-LABEL: test_reduce_any -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_any"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_any"(%arg0) <{axis = 0 : i64} +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array} func.func @test_reduce_any(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { %2 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.Any"(%arg0, %2) {keep_dims = false} : (tensor<13x21x3xi1>, tensor<1xi32>) -> tensor<21x3xi1> @@ -252,8 +252,8 @@ func.func @test_reduce_any(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { // ----- // CHECK-LABEL: test_reduce_all -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_all"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_all"(%arg0) <{axis = 0 : i64} +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array} func.func @test_reduce_all(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { %2 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.All"(%arg0, %2) {keep_dims = false} : (tensor<13x21x3xi1>, tensor<1xi32>) -> tensor<21x3xi1> @@ -263,8 +263,8 @@ func.func @test_reduce_all(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { // ----- // CHECK-LABEL: test_reduce_min -// CHECK-DAG: %[[VAR0:.*]] = 
"tosa.reduce_min"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_min"(%arg0) <{axis = 0 : i64} +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array} func.func @test_reduce_min(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.Min"(%arg0, %2) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> @@ -274,8 +274,8 @@ func.func @test_reduce_min(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // ----- // CHECK-LABEL: test_reduce_max -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_max"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_max"(%arg0) <{axis = 0 : i64} +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array} func.func @test_reduce_max(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.Max"(%arg0, %2) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> @@ -285,8 +285,8 @@ func.func @test_reduce_max(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // ----- // CHECK-LABEL: test_reduce_sum -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_sum"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_sum"(%arg0) <{axis = 0 : i64} +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array} func.func @test_reduce_sum(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.Sum"(%arg0, %2) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> @@ -297,11 +297,11 @@ func.func @test_reduce_sum(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // CHECK-LABEL: test_reduce_sum_nonzero_axis // CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30x40x50xf32> -// CHECK: %[[VAL_1:.*]] = "tosa.const"() {value = dense<[0, 1, 2, 4, 3]> : tensor<5xi32>} : () -> tensor<5xi32> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<[0, 1, 2, 4, 3]> : tensor<5xi32>}> : () -> tensor<5xi32> // CHECK: %[[VAL_2:.*]] = "tosa.transpose"(%[[VAL_0]], %[[VAL_1]]) : (tensor<10x20x30x40x50xf32>, tensor<5xi32>) -> tensor<10x20x30x50x40xf32> -// CHECK: %[[VAL_3:.*]] = "tosa.reshape"(%[[VAL_2]]) {new_shape = array} : (tensor<10x20x30x50x40xf32>) -> tensor<300000x40xf32> -// CHECK: %[[VAL_4:.*]] = "tosa.reduce_sum"(%[[VAL_3]]) {axis = 1 : i64} : (tensor<300000x40xf32>) -> tensor<300000x1xf32> -// CHECK: %[[VAL_5:.*]] = "tosa.reshape"(%[[VAL_4]]) {new_shape = array} : (tensor<300000x1xf32>) -> tensor<10x20x30x50xf32> +// CHECK: %[[VAL_3:.*]] = "tosa.reshape"(%[[VAL_2]]) <{new_shape = array}> : (tensor<10x20x30x50x40xf32>) -> tensor<300000x40xf32> +// CHECK: %[[VAL_4:.*]] = "tosa.reduce_sum"(%[[VAL_3]]) <{axis = 1 : i64}> : (tensor<300000x40xf32>) -> tensor<300000x1xf32> +// CHECK: %[[VAL_5:.*]] = "tosa.reshape"(%[[VAL_4]]) <{new_shape = array}> : (tensor<300000x1xf32>) -> tensor<10x20x30x50xf32> // CHECK: return %[[VAL_5]] : tensor<10x20x30x50xf32> func.func @test_reduce_sum_nonzero_axis(%arg0: tensor<10x20x30x40x50xf32> {tf._user_specified_name = "inp_list"}) -> tensor<10x20x30x50xf32> { %cst = "tf.Const"() {device = "", value = dense<3> : tensor} : () -> tensor @@ -313,10 +313,10 @@ func.func 
@test_reduce_sum_nonzero_axis(%arg0: tensor<10x20x30x40x50xf32> {tf._u // ----- // CHECK-LABEL: test_reduce_mean -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.0769230798> : tensor<1x1xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%arg0) {axis = 0 : i64} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.reshape"(%[[VAR1]]) {new_shape = array} -// CHECK: %[[VAR4:.*]] = "tosa.mul"(%[[VAR2]], %[[VAR0]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.0769230798> : tensor<1x1xf32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%arg0) <{axis = 0 : i64} +// CHECK-DAG: %[[VAR2:.*]] = "tosa.reshape"(%[[VAR1]]) <{new_shape = array} +// CHECK: %[[VAR4:.*]] = "tosa.mul"(%[[VAR2]], %[[VAR0]]) <{shift = 0 : i32} func.func @test_reduce_mean(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.Mean"(%arg0, %2) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> @@ -326,8 +326,8 @@ func.func @test_reduce_mean(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // ----- // CHECK-LABEL: test_reduce_product -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_prod"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_prod"(%arg0) <{axis = 0 : i64} +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array} func.func @test_reduce_product(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.Prod"(%arg0, %2) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> @@ -420,12 +420,12 @@ func.func @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // CHECK-LABEL: test_sin // CHECK-SAME: -> tensor<10xf32> func.func @test_sin(%arg0: tensor<10xf32>) -> tensor<*xf32> { - // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1xf32>} - // CHECK-DAG: %[[TWO:.+]] = "tosa.const"() {value = dense<2.000000e+00> : tensor<1xf32>} - // CHECK-DAG: %[[RESULT_SCALE:.+]] = "tosa.const"() {value = dense<2.38418579E-7> : tensor<1xf32>} - // CHECK-DAG: %[[INT_MAX:.+]] = "tosa.const"() {value = dense<3.276700e+04> : tensor<1xf32>} - // CHECK-DAG: %[[IN_SCALE:.+]] = "tosa.const"() {value = dense<0.159154937> : tensor<1xf32>} - // CHECK-DAG: %[[TBLVAL:.+]] = "tosa.const"() {value = dense<{{.+}}> : tensor<513xi16>} + // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1xf32>} + // CHECK-DAG: %[[TWO:.+]] = "tosa.const"() <{value = dense<2.000000e+00> : tensor<1xf32>} + // CHECK-DAG: %[[RESULT_SCALE:.+]] = "tosa.const"() <{value = dense<2.38418579E-7> : tensor<1xf32>} + // CHECK-DAG: %[[INT_MAX:.+]] = "tosa.const"() <{value = dense<3.276700e+04> : tensor<1xf32>} + // CHECK-DAG: %[[IN_SCALE:.+]] = "tosa.const"() <{value = dense<0.159154937> : tensor<1xf32>} + // CHECK-DAG: %[[TBLVAL:.+]] = "tosa.const"() <{value = dense<{{.+}}> : tensor<513xi16>} // CHECK-DAG: %[[IN_SCALED:.+]] = "tosa.mul"(%arg0, %[[IN_SCALE]]) // CHECK-DAG: %[[FLOOR:.+]] = "tosa.floor"(%[[IN_SCALED]]) // CHECK-DAG: %[[SUB1:.+]] = "tosa.sub"(%[[IN_SCALED]], %[[FLOOR]]) @@ -447,13 +447,13 @@ func.func @test_sin(%arg0: tensor<10xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_cos // CHECK-SAME: -> tensor<10xf32> func.func @test_cos(%arg0: tensor<10xf32>) -> tensor<*xf32> { - // CHECK-DAG: %[[RESULT_SCALE:.+]] = "tosa.const"() {value = dense<2.38418579E-7> : tensor<1xf32>} 
- // CHECK-DAG: %[[INT_MAX:.+]] = "tosa.const"() {value = dense<3.276700e+04> : tensor<1xf32>} - // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1xf32>} - // CHECK-DAG: %[[TWO:.+]] = "tosa.const"() {value = dense<2.000000e+00> : tensor<1xf32>} - // CHECK-DAG: %[[IN_SCALE:.+]] = "tosa.const"() {value = dense<0.159154937> : tensor<1xf32>} - // CHECK-DAG: %[[HALF_PI:.+]] = "tosa.const"() {value = dense<1.57079637> : tensor<1xf32>} - // CHECK-DAG: %[[TBLVAL:.+]] = "tosa.const"() {value = dense<{{.+}}> : tensor<513xi16>} + // CHECK-DAG: %[[RESULT_SCALE:.+]] = "tosa.const"() <{value = dense<2.38418579E-7> : tensor<1xf32>} + // CHECK-DAG: %[[INT_MAX:.+]] = "tosa.const"() <{value = dense<3.276700e+04> : tensor<1xf32>} + // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1xf32>} + // CHECK-DAG: %[[TWO:.+]] = "tosa.const"() <{value = dense<2.000000e+00> : tensor<1xf32>} + // CHECK-DAG: %[[IN_SCALE:.+]] = "tosa.const"() <{value = dense<0.159154937> : tensor<1xf32>} + // CHECK-DAG: %[[HALF_PI:.+]] = "tosa.const"() <{value = dense<1.57079637> : tensor<1xf32>} + // CHECK-DAG: %[[TBLVAL:.+]] = "tosa.const"() <{value = dense<{{.+}}> : tensor<513xi16>} // CHECK-DAG: %[[IN_TRANSLATE:.+]] = "tosa.add"(%arg0, %[[HALF_PI]]) // CHECK-DAG: %[[IN_SCALED:.+]] = "tosa.mul"(%[[IN_TRANSLATE]], %[[IN_SCALE]]) // CHECK-DAG: %[[FLOOR:.+]] = "tosa.floor"(%[[IN_SCALED]]) @@ -473,6 +473,22 @@ func.func @test_cos(%arg0: tensor<10xf32>) -> tensor<*xf32> { // ----- +// CHECK-LABEL: test_sign +// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x33xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1xf32>} +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<-1.000000e+00> : tensor<1x1xf32>} +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1xf32>} +// CHECK: %[[VAL_4:.*]] = "tosa.greater"(%[[VAL_0]], %[[VAL_1]]) +// CHECK: %[[VAL_5:.*]] = "tosa.greater"(%[[VAL_1]], %[[VAL_0]]) +// CHECK: %[[VAL_6:.*]] = "tosa.select"(%[[VAL_5]], %[[VAL_2]], %[[VAL_1]]) +// CHECK: %[[VAL_7:.*]] = "tosa.select"(%[[VAL_4]], %[[VAL_3]], %[[VAL_6]]) +func.func @test_sign(%arg0: tensor<8x33xf32>) -> tensor<8x33xf32> { + %0 = "tf.Sign"(%arg0) : (tensor<8x33xf32>) -> tensor<8x33xf32> + func.return %0 : tensor<8x33xf32> +} + +// ----- + // CHECK-LABEL: test_sigmoid // CHECK: %[[VAR0:.*]] = "tosa.sigmoid"(%arg0) func.func @test_sigmoid(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { @@ -483,7 +499,7 @@ func.func @test_sigmoid(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_square -// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg0) {shift = 0 : i32} +// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg0) <{shift = 0 : i32} func.func @test_square(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Square"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> func.return %2 : tensor<13x21x3xf32> @@ -539,7 +555,7 @@ func.func @test_less_equal(%arg0: tensor<13x21x3xf32>, %arg1: tensor<1x21x3xf32> // ----- // CHECK-LABEL: test_argmax -// CHECK: %[[VAR0:.*]] = "tosa.argmax"(%arg0) {axis = 0 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.argmax"(%arg0) <{axis = 0 : i64} func.func @test_argmax(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xi32> { %2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor %3 = "tf.ArgMax"(%arg0, %2) : (tensor<13x21x3xf32>, tensor) -> tensor<21x3xi32> @@ -549,7 +565,7 @@ func.func @test_argmax(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xi32> { // ----- // 
CHECK-LABEL: test_avg_pool2d -// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) {kernel = array, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{kernel = array, pad = array, stride = array} func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> { %2 = "tf.AvgPool"(%arg0) {data_format = "NHWC", ksize = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> func.return %2 : tensor<1x32x32x8xf32> @@ -558,7 +574,7 @@ func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32 // ----- // CHECK-LABEL: test_max_pool2d -// CHECK: %[[VAR0:.*]] = "tosa.max_pool2d"(%arg0) {kernel = array, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.max_pool2d"(%arg0) <{kernel = array, pad = array, stride = array} func.func @test_max_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> { %2 = "tf.MaxPool"(%arg0) {data_format = "NHWC", explicit_paddings = [], ksize = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> func.return %2 : tensor<1x32x32x8xf32> @@ -567,7 +583,7 @@ func.func @test_max_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32 // ----- // CHECK-LABEL: test_reshape -// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} func.func @test_reshape(%arg0: tensor<13x21x3xf32>) -> tensor<1x819xf32> { %0 = "tf.Const"() {value = dense<[1, 819]> : tensor<2xi32>} : () -> tensor<2xi32> %3 = "tf.Reshape"(%arg0, %0) : (tensor<13x21x3xf32>, tensor<2xi32>) -> tensor<1x819xf32> @@ -578,7 +594,7 @@ func.func @test_reshape(%arg0: tensor<13x21x3xf32>) -> tensor<1x819xf32> { // ----- // CHECK-LABEL: test_transpose -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[2, 0, 1]> : tensor<3xi32>} // CHECK: %[[VAR1:.*]] = "tosa.transpose"(%arg0, %[[VAR0]]) func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> { %2 = "tf.Const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32> @@ -589,7 +605,7 @@ func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> { // ----- // CHECK-LABEL: test_slice -// CHECK: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} +// CHECK: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array} func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<4x11x1xf32> { %2 = "tf.Const"() {value = dense<[6, 8, 0]> : tensor<3xi64>} : () -> tensor<3xi64> %3 = "tf.Const"() {value = dense<[4, 11, 1]> : tensor<3xi64>} : () -> tensor<3xi64> @@ -600,10 +616,10 @@ func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<4x11x1xf32> { // ----- // CHECK-LABEL: test_strided_slice -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) {size = array, start = array} -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array} +// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array} +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array} func.func @test_strided_slice(%arg0: 
tensor<13x21x3xf32>) -> tensor<9x7x2xf32> { %2 = "tf.Const"() {value = dense<[4, 0, 1]> : tensor<3xi64>} : () -> tensor<3xi64> %3 = "tf.Const"() {value = dense<[13, 21, 3]> : tensor<3xi64>} : () -> tensor<3xi64> @@ -615,7 +631,7 @@ func.func @test_strided_slice(%arg0: tensor<13x21x3xf32>) -> tensor<9x7x2xf32> { // ----- // CHECK-LABEL: test_select -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%arg2) {new_shape = array} : (tensor<1xi1>) -> tensor<1x1x1xi1> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%arg2) <{new_shape = array}> : (tensor<1xi1>) -> tensor<1x1x1xi1> // CHECK: %[[VAR2:.*]] = "tosa.select"(%[[VAR1]], %arg0, %arg1) func.func @test_select(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<1xi1>) -> tensor<13x21x3xf32> { %2 = "tf.SelectV2"(%arg2, %arg0, %arg1) : (tensor<1xi1>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> @@ -636,7 +652,7 @@ func.func @test_addn(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %ar // ----- // CHECK-LABEL: test_concatv2 -// CHECK: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1, %arg2, %arg3) {axis = 0 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1, %arg2, %arg3) <{axis = 0 : i64} func.func @test_concatv2(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<13x21x3xf32>, %arg3: tensor<13x21x3xf32>) -> tensor<52x21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor %3 = "tf.ConcatV2"(%arg0, %arg1, %arg2, %arg3, %2) : (tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor) -> tensor<52x21x3xf32> @@ -646,8 +662,8 @@ func.func @test_concatv2(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, // ----- // CHECK-LABEL: test_stack -// CHECK-DAG: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1, %arg2, %arg3) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1, %arg2, %arg3) <{axis = 0 : i64} +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array} func.func @test_stack(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<13x21x3xf32>, %arg3: tensor<13x21x3xf32>) -> tensor<4x13x21x3xf32> { %2 = "tf.Pack"(%arg0, %arg1, %arg2, %arg3) {axis = 0 : i64} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<4x13x21x3xf32> func.return %2 : tensor<4x13x21x3xf32> @@ -656,7 +672,7 @@ func.func @test_stack(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %a // ----- // CHECK-LABEL: test_unstack -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} func.func @test_unstack(%arg0: tensor<1x32x32x8xf32>) -> tensor<32x32x8xf32> { %2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<1x32x32x8xf32>) -> tensor<32x32x8xf32> %3 = "tf.Identity"(%2) : (tensor<32x32x8xf32>) -> tensor<32x32x8xf32> @@ -666,8 +682,8 @@ func.func @test_unstack(%arg0: tensor<1x32x32x8xf32>) -> tensor<32x32x8xf32> { // ----- // CHECK-LABEL: test_pad -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<1> : tensor<3x2xi32>} -// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1> : tensor<3x2xi32>} +// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor} // CHECK: %[[VAR1:.*]] = "tosa.pad"(%arg0, %[[VAR0]], %[[PVAL]]) func.func @test_pad(%arg0: tensor<13x21x3xf32>) -> tensor<15x23x5xf32> { %2 = "tf.Const"() {value 
= dense<1> : tensor<3x2xi32>} : () -> tensor<3x2xi32> @@ -678,7 +694,7 @@ func.func @test_pad(%arg0: tensor<13x21x3xf32>) -> tensor<15x23x5xf32> { // ----- // CHECK-LABEL: test_expand_dims -// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} func.func @test_expand_dims(%arg0: tensor<13x21x3xf32>) -> tensor<1x13x21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor %3 = "tf.ExpandDims"(%arg0, %2) : (tensor<13x21x3xf32>, tensor) -> tensor<1x13x21x3xf32> @@ -688,7 +704,7 @@ func.func @test_expand_dims(%arg0: tensor<13x21x3xf32>) -> tensor<1x13x21x3xf32> // ----- // CHECK-LABEL: test_expand_dims_negative_index -// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} func.func @test_expand_dims_negative_index(%arg0: tensor<13x21x3xf32>) -> tensor<13x1x21x3xf32> { %2 = "tf.Const"() {value = dense<-2> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.ExpandDims"(%arg0, %2) : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<13x1x21x3xf32> @@ -698,7 +714,7 @@ func.func @test_expand_dims_negative_index(%arg0: tensor<13x21x3xf32>) -> tensor // ----- // CHECK-LABEL: test_shape -// CHECK: %[[VAR0:.*]] = "tosa.const"() {value = dense<[13, 21, 3]> : tensor<3xi32>} +// CHECK: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[13, 21, 3]> : tensor<3xi32>} func.func @test_shape() -> tensor<3xi32> { %3 = "tf.Const"() {value = dense<[13, 21, 3]> : tensor<3xi32>} : () -> tensor<3xi32> func.return %3 : tensor<3xi32> @@ -707,7 +723,7 @@ func.func @test_shape() -> tensor<3xi32> { // ----- // CHECK-LABEL: test_rank -// CHECK: %[[VAR0:.*]] = "tosa.const"() {value = dense<3> : tensor} +// CHECK: %[[VAR0:.*]] = "tosa.const"() <{value = dense<3> : tensor} func.func @test_rank() -> tensor { %3 = "tf.Const"() {value = dense<3> : tensor} : () -> tensor func.return %3 : tensor @@ -716,8 +732,8 @@ func.func @test_rank() -> tensor { // ----- // CHECK-LABEL: test_elu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x1xf32>} // CHECK-DAG: %[[VAR2:.*]] = "tosa.exp"(%arg0) // CHECK-DAG: %[[VAR4:.*]] = "tosa.sub"(%[[VAR2]], %[[VAR0]]) // CHECK-DAG: %[[VAR6:.*]] = "tosa.greater_equal"(%arg0, %[[VAR1]]) @@ -730,10 +746,12 @@ func.func @test_elu(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_softmax -// CHECK-DAG: %[[VAR0:.*]] = "tosa.exp"(%arg0) -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%[[VAR0]]) {axis = 2 : i64} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.reciprocal"(%[[VAR1]]) -// CHECK: %[[VAR3:.*]] = "tosa.mul"(%[[VAR0]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_max"(%arg0) +// CHECK-DAG: %[[VAR1:.*]] = "tosa.sub"(%arg0, %[[VAR0]]) +// CHECK-DAG: %[[VAR2:.*]] = "tosa.exp"(%[[VAR1]]) +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reduce_sum"(%[[VAR2]]) <{axis = 2 : i64} +// CHECK-DAG: %[[VAR4:.*]] = "tosa.reciprocal"(%[[VAR3]]) +// CHECK: %[[VAR5:.*]] = "tosa.mul"(%[[VAR2]], %[[VAR4]]) <{shift = 0 : i32} func.func @test_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Softmax"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> func.return %2 : 
tensor<13x21x3xf32> @@ -743,9 +761,9 @@ func.func @test_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // CHECK-LABEL: test_log_softmax // CHECK-DAG: %[[VAR0:.*]] = "tosa.exp"(%arg0) -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%[[VAR0]]) {axis = 2 : i64} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%[[VAR0]]) <{axis = 2 : i64} // CHECK-DAG: %[[VAR2:.*]] = "tosa.reciprocal"(%[[VAR1]]) -// CHECK-DAG: %[[VAR3:.*]] = "tosa.mul"(%[[VAR0]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.mul"(%[[VAR0]], %[[VAR2]]) <{shift = 0 : i32} // CHECK: %[[VAR4:.*]] = "tosa.log"(%[[VAR3]]) func.func @test_log_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.LogSoftmax"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> @@ -764,10 +782,10 @@ func.func @test_batch_matmul_3d(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x3x4 // ----- // CHECK-LABEL: test_batch_matmul_4d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg1) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg1) <{new_shape = array} // CHECK-DAG: %[[VAR2:.*]] = "tosa.matmul"(%[[VAR0]], %[[VAR1]]) -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array} func.func @test_batch_matmul_4d(%arg0: tensor<5x13x21x3xf32>, %arg1: tensor<5x13x3x42xf32>) -> tensor<5x13x21x42xf32> { %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false, device = ""} : (tensor<5x13x21x3xf32>, tensor<5x13x3x42xf32>) -> tensor<5x13x21x42xf32> func.return %0 : tensor<5x13x21x42xf32> @@ -776,10 +794,10 @@ func.func @test_batch_matmul_4d(%arg0: tensor<5x13x21x3xf32>, %arg1: tensor<5x13 // ----- // CHECK-LABEL: test_matmul -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg1) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg1) <{new_shape = array} // CHECK-DAG: %[[VAR2:.*]] = "tosa.matmul"(%[[VAR0]], %[[VAR1]]) -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array} func.func @test_matmul(%arg0: tensor<14x19xf32>, %arg1: tensor<19x28xf32>) -> tensor<14x28xf32> { %2 = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : (tensor<14x19xf32>, tensor<19x28xf32>) -> tensor<14x28xf32> func.return %2 : tensor<14x28xf32> @@ -788,7 +806,7 @@ func.func @test_matmul(%arg0: tensor<14x19xf32>, %arg1: tensor<19x28xf32>) -> te // ----- // CHECK-LABEL: test_add_scalar -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>} // CHECK: %[[VAR2:.*]] = "tosa.add"(%arg0, %[[VAR0]]) func.func @test_add_scalar(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor @@ -799,8 +817,8 @@ func.func @test_add_scalar(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_add_1d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_sum"(%arg1) {axis = 0 : i64} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%[[VAR0]]) {axis = 1 : i64} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_sum"(%arg1) <{axis = 0 : i64} +// 
CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%[[VAR0]]) <{axis = 1 : i64} // CHECK: %[[VAR2:.*]] = "tosa.add"(%arg0, %[[VAR1]]) func.func @test_add_1d(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi32>} : () -> tensor<2xi32> @@ -812,9 +830,9 @@ func.func @test_add_1d(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) - // ----- // CHECK-LABEL: test_split -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK: %[[VAR2:.*]] = "tosa.slice"(%arg0) {size = array, start = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.slice"(%arg0) <{size = array, start = array} +// CHECK: %[[VAR2:.*]] = "tosa.slice"(%arg0) <{size = array, start = array} func.func @test_split(%arg0: tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, tensor<13x7x3xf32>, tensor<13x7x3xf32>) { %6 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %7:3 = "tf.Split"(%6, %arg0) : (tensor, tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, tensor<13x7x3xf32>, tensor<13x7x3xf32>) @@ -845,13 +863,13 @@ func.func @test_reverse(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_space_to_batch -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<{{\[}}[0, 0], [0, 1], [0, 0]]> : tensor<3x2xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<[2, 0, 1, 3]> : tensor<4xi32>} -// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [0, 1], [0, 0]]> : tensor<3x2xi32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<[2, 0, 1, 3]> : tensor<4xi32>} +// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor} // CHECK-DAG: %[[VAR2:.*]] = "tosa.pad"(%arg0, %[[VAR0]], %[[PVAL]]) -// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array} // CHECK-DAG: %[[VAR4:.*]] = "tosa.transpose"(%[[VAR3]], %[[VAR1]]) -// CHECK: %[[VAR5:.*]] = "tosa.reshape"(%[[VAR4]]) {new_shape = array} +// CHECK: %[[VAR5:.*]] = "tosa.reshape"(%[[VAR4]]) <{new_shape = array} func.func @test_space_to_batch(%arg0: tensor<13x21x3xf32>) -> tensor<26x11x3xf32> { %2 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> %3 = "tf.Const"() {value = dense<[[0, 1]]> : tensor<1x2xi32>} : () -> tensor<1x2xi32> @@ -862,12 +880,12 @@ func.func @test_space_to_batch(%arg0: tensor<13x21x3xf32>) -> tensor<26x11x3xf32 // ----- // CHECK-LABEL: test_batch_to_space -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<[3, 1, 2, 0]> : tensor<4xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<[2, 3, 0, 4, 1, 5]> : tensor<6xi32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[3, 1, 2, 0]> : tensor<4xi32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<[2, 3, 0, 4, 1, 5]> : tensor<6xi32>} // CHECK-DAG: %[[VAR2:.*]] = "tosa.transpose"(%arg0, %[[VAR0]]) -// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array} // CHECK-DAG: %[[VAR4:.*]] = "tosa.transpose"(%[[VAR3]], %[[VAR1]]) -// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%[[VAR4]]) {new_shape = array} +// CHECK-DAG: %[[VAR5:.*]] = 
"tosa.reshape"(%[[VAR4]]) <{new_shape = array} // CHECK: return %[[VAR5]] func.func @test_batch_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<2x64x64x1xf32> { %2 = "tf.Const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> @@ -881,10 +899,10 @@ func.func @test_batch_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<2x64x64x1 // ----- // CHECK-LABEL: test_space_to_depth -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} // CHECK-DAG: %[[VAR2:.*]] = "tosa.transpose"(%[[VAR1]], %[[VAR0]]) -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array} func.func @test_space_to_depth(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x16x16x32xf32> { %2 = "tf.SpaceToDepth"(%arg0) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<1x32x32x8xf32>) -> tensor<1x16x16x32xf32> func.return %2 : tensor<1x16x16x32xf32> @@ -893,10 +911,10 @@ func.func @test_space_to_depth(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x16x16x3 // ----- // CHECK-LABEL: test_depth_to_space -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} // CHECK-DAG: %[[VAR2:.*]] = "tosa.transpose"(%[[VAR1]], %[[VAR0]]) -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array} func.func @test_depth_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x64x64x2xf32> { %2 = "tf.DepthToSpace"(%arg0) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<1x32x32x8xf32>) -> tensor<1x64x64x2xf32> func.return %2 : tensor<1x64x64x2xf32> @@ -914,7 +932,7 @@ func.func @test_left_shift(%arg0: tensor<4x4xi32>, %arg1: tensor<1x1xi32>) -> te // ----- // CHECK-LABEL: test_right_shift -// CHECK: %[[VAR0:.*]] = "tosa.arithmetic_right_shift"(%arg0, %arg1) {round = false} +// CHECK: %[[VAR0:.*]] = "tosa.arithmetic_right_shift"(%arg0, %arg1) <{round = false} func.func @test_right_shift(%arg0: tensor<4x4xi32>, %arg1: tensor<1x1xi32>) -> tensor<4x4xi32> { %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4x4xi32>, tensor<1x1xi32>) -> tensor<4x4xi32> func.return %0 : tensor<4x4xi32> @@ -924,13 +942,13 @@ func.func @test_right_shift(%arg0: tensor<4x4xi32>, %arg1: tensor<1x1xi32>) -> t // CHECK-LABEL: @test_one_hot // CHECK-SAME: %[[ARG0_0:.*]]: tensor<4x4xi32>, %[[ARG1_0:.*]]: tensor, %[[ARG2:.*]]: tensor -// CHECK: %[[RESHAPE_0:.*]] = "tosa.reshape"(%[[ARG1_0]]) {new_shape = array} -// CHECK: %[[TILE:.*]] = "tosa.tile"(%[[RESHAPE_0]]) {multiples = array} -// CHECK: %[[RESHAPE_1:.*]] = "tosa.reshape"(%[[ARG2]]) {new_shape = array} -// CHECK: %[[TILE_0:.*]] = "tosa.tile"(%[[RESHAPE_1]]) {multiples = array} -// CHECK: %[[RESHAPE_2:.*]] = "tosa.reshape"(%[[ARG0_0]]) {new_shape = array} +// CHECK: %[[RESHAPE_0:.*]] = "tosa.reshape"(%[[ARG1_0]]) <{new_shape = array} +// CHECK: %[[TILE:.*]] = "tosa.tile"(%[[RESHAPE_0]]) <{multiples = array} +// CHECK: %[[RESHAPE_1:.*]] = "tosa.reshape"(%[[ARG2]]) <{new_shape = array} +// CHECK: %[[TILE_0:.*]] 
= "tosa.tile"(%[[RESHAPE_1]]) <{multiples = array} +// CHECK: %[[RESHAPE_2:.*]] = "tosa.reshape"(%[[ARG0_0]]) <{new_shape = array} // CHECK: %[[SCATTER:.*]] = "tosa.scatter"(%[[TILE_0]], %[[RESHAPE_2]], %[[TILE]]) -// CHECK: %[[RESHAPE_3:.*]] = "tosa.reshape"(%[[SCATTER]]) {new_shape = array} +// CHECK: %[[RESHAPE_3:.*]] = "tosa.reshape"(%[[SCATTER]]) <{new_shape = array} // CHECK: return %[[RESHAPE_3]] func.func @test_one_hot(%arg0: tensor<4x4xi32>, %arg1: tensor, %arg2: tensor) -> tensor<4x4x2xf32> { %0 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor @@ -941,18 +959,18 @@ func.func @test_one_hot(%arg0: tensor<4x4xi32>, %arg1: tensor, %arg2: tenso // ----- // CHECK-LABEL: test_fakequant_with_min_max_args -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<-2.00003052> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<1.99996948> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() {value = dense<6.10360876E-5> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() {value = dense<16383.75> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<-2.00003052> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<1.99996948> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<6.10360876E-5> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = dense<16383.75> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{value = dense<5.000000e-01> : tensor<1x1x1xf32>} // CHECK-DAG: %[[VAR6:.*]] = "tosa.minimum"(%arg0, %[[VAR1]]) // CHECK-DAG: %[[VAR8:.*]] = "tosa.maximum"(%[[VAR6]], %[[VAR0]]) // CHECK-DAG: %[[VAR10:.*]] = "tosa.sub"(%[[VAR8]], %[[VAR0]]) -// CHECK-DAG: %[[VAR12:.*]] = "tosa.mul"(%[[VAR10]], %[[VAR3]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR12:.*]] = "tosa.mul"(%[[VAR10]], %[[VAR3]]) <{shift = 0 : i32} // CHECK-DAG: %[[VAR14:.*]] = "tosa.add"(%[[VAR12]], %[[VAR4]]) // CHECK-DAG: %[[VAR15:.*]] = "tosa.floor"(%[[VAR14]]) -// CHECK-DAG: %[[VAR17:.*]] = "tosa.mul"(%[[VAR15]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR17:.*]] = "tosa.mul"(%[[VAR15]], %[[VAR2]]) <{shift = 0 : i32} // CHECK: %[[VAR19:.*]] = "tosa.add"(%[[VAR17]], %[[VAR0]]) func.func @test_fakequant_with_min_max_args(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.FakeQuantWithMinMaxArgs"(%arg0) {max = 2.000000e+00 : f32, min = -2.000000e+00 : f32, narrow_range = false, num_bits = 16 : i64} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> @@ -961,10 +979,10 @@ func.func @test_fakequant_with_min_max_args(%arg0: tensor<13x21x3xf32>) -> tenso // ----- // CHECK-LABEL: test_gather -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<1x49xi32>} -// CHECK-DAG: %[[VAR4:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<1x49xi32>} +// CHECK-DAG: %[[VAR4:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} // CHECK-DAG: %[[VAR6:.*]] = "tosa.gather"(%[[VAR4]], %[[VAR0]]) -// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) {new_shape = array} +// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) <{new_shape = array} // CHECK: return %[[VAR7]] func.func @test_gather(%arg0: tensor<13x21x3xf32>) -> tensor<7x7x21x3xf32> { %0 = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor @@ -976,10 +994,10 @@ func.func 
@test_gather(%arg0: tensor<13x21x3xf32>) -> tensor<7x7x21x3xf32> { // ----- // CHECK-LABEL: test_gather_nd -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<1x42xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<1x42xi32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array} // CHECK-DAG: %[[VAR2:.*]] = "tosa.gather"(%[[VAR1]], %[[VAR0]]) -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array} func.func @test_gather_nd(%arg0: tensor<13x21x3xf32>) -> tensor<6x7x21x3xf32> { %0 = "tf.Const"() {device = "", value = dense<[[[0], [5], [3], [12], [2], [4], [3]], [[11], [1], [11], [10], [3], [12], [8]], [[5], [3], [1], [11], [3], [10], [0]], [[0], [8], [4], [7], [3], [12], [2]], [[7], [6], [11], [4], [2], [10], [11]], [[11], [1], [11], [1], [1], [11], [8]]]> : tensor<6x7x1xi32>} : () -> tensor<6x7x1xi32> %1 = "tf.GatherNd"(%arg0, %0) {device = ""} : (tensor<13x21x3xf32>, tensor<6x7x1xi32>) -> tensor<6x7x21x3xf32> @@ -992,16 +1010,16 @@ func.func @test_gather_nd(%arg0: tensor<13x21x3xf32>) -> tensor<6x7x21x3xf32> { // CHECK-LABEL: test_fused_batch_norm func.func @test_fused_batch_norm(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { - // CHECK: %[[ONE:.+]] = "tosa.const"() {value = dense<1.000000e-03> : tensor<1xf32>} - // CHECK: %[[RES0:.+]] = "tosa.reshape"(%arg3) {new_shape = array} + // CHECK: %[[ONE:.+]] = "tosa.const"() <{value = dense<1.000000e-03> : tensor<1xf32>} + // CHECK: %[[RES0:.+]] = "tosa.reshape"(%arg3) <{new_shape = array} // CHECK: %[[SUB0:.+]] = "tosa.sub"(%arg0, %[[RES0]]) // CHECK: %[[ADD0:.+]] = "tosa.add"(%arg4, %[[ONE]]) // CHECK: %[[RSQR:.+]] = "tosa.rsqrt"(%[[ADD0]]) - // CHECK: %[[RES1:.+]] = "tosa.reshape"(%[[RSQR]]) {new_shape = array} - // CHECK: %[[MUL0:.+]] = "tosa.mul"(%[[SUB0]], %[[RES1]]) {shift = 0 : i32} - // CHECK: %[[RES1:.+]] = "tosa.reshape"(%arg1) {new_shape = array} - // CHECK: %[[MUL1:.+]] = "tosa.mul"(%[[MUL0]], %[[RES1]]) {shift = 0 : i32} - // CHECK: %[[RES2:.+]] = "tosa.reshape"(%arg2) {new_shape = array} + // CHECK: %[[RES1:.+]] = "tosa.reshape"(%[[RSQR]]) <{new_shape = array} + // CHECK: %[[MUL0:.+]] = "tosa.mul"(%[[SUB0]], %[[RES1]]) <{shift = 0 : i32} + // CHECK: %[[RES1:.+]] = "tosa.reshape"(%arg1) <{new_shape = array} + // CHECK: %[[MUL1:.+]] = "tosa.mul"(%[[MUL0]], %[[RES1]]) <{shift = 0 : i32} + // CHECK: %[[RES2:.+]] = "tosa.reshape"(%arg2) <{new_shape = array} // CHECK: %[[ADD1:.+]] = "tosa.add"(%[[MUL1]], %[[RES2]]) %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<*xf32>) @@ -1022,14 +1040,14 @@ func.func @test_fused_batch_norm_training(%arg0: tensor<8x8x8x8xf32>, %arg1: ten // CHECK-LABEL: mirrorpad_symmetric // CHECK-SAME: %[[VAL_0:.*]]: tensor<5x10xf32> -// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) {size = array, start = array} : (tensor<5x10xf32>) -// CHECK: %[[VAL_2:.*]] = "tosa.slice"(%[[VAL_0]]) {size = array, start = array} : (tensor<5x10xf32>) -// CHECK: %[[VAL_3:.*]] = "tosa.reverse"(%[[VAL_2]]) {axis = 0 : i64} : 
(tensor<2x10xf32>) -// CHECK: %[[VAL_4:.*]] = "tosa.concat"(%[[VAL_1]], %[[VAL_0]], %[[VAL_3]]) {axis = 0 : i64} : (tensor<1x10xf32>, tensor<5x10xf32>, tensor<2x10xf32>) -// CHECK: %[[VAL_5:.*]] = "tosa.slice"(%[[VAL_4]]) {size = array, start = array} : (tensor<8x10xf32>) -// CHECK: %[[VAL_6:.*]] = "tosa.slice"(%[[VAL_4]]) {size = array, start = array} : (tensor<8x10xf32>) -// CHECK: %[[VAL_7:.*]] = "tosa.reverse"(%[[VAL_6]]) {axis = 1 : i64} : (tensor<8x2xf32>) -// CHECK: %[[VAL_8:.*]] = "tosa.concat"(%[[VAL_5]], %[[VAL_4]], %[[VAL_7]]) {axis = 1 : i64} : (tensor<8x1xf32>, tensor<8x10xf32>, tensor<8x2xf32>) +// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) <{size = array, start = array}> : (tensor<5x10xf32>) +// CHECK: %[[VAL_2:.*]] = "tosa.slice"(%[[VAL_0]]) <{size = array, start = array}> : (tensor<5x10xf32>) +// CHECK: %[[VAL_3:.*]] = "tosa.reverse"(%[[VAL_2]]) <{axis = 0 : i64}> : (tensor<2x10xf32>) +// CHECK: %[[VAL_4:.*]] = "tosa.concat"(%[[VAL_1]], %[[VAL_0]], %[[VAL_3]]) <{axis = 0 : i64}> : (tensor<1x10xf32>, tensor<5x10xf32>, tensor<2x10xf32>) +// CHECK: %[[VAL_5:.*]] = "tosa.slice"(%[[VAL_4]]) <{size = array, start = array}> : (tensor<8x10xf32>) +// CHECK: %[[VAL_6:.*]] = "tosa.slice"(%[[VAL_4]]) <{size = array, start = array}> : (tensor<8x10xf32>) +// CHECK: %[[VAL_7:.*]] = "tosa.reverse"(%[[VAL_6]]) <{axis = 1 : i64}> : (tensor<8x2xf32>) +// CHECK: %[[VAL_8:.*]] = "tosa.concat"(%[[VAL_5]], %[[VAL_4]], %[[VAL_7]]) <{axis = 1 : i64}> : (tensor<8x1xf32>, tensor<8x10xf32>, tensor<8x2xf32>) func.func @mirrorpad_symmetric(%arg0: tensor<5x10xf32>) -> tensor<8x13xf32> { %cst = "tf.Const"() {device = "", value = dense<[[1, 2], [1, 2]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32> %0 = "tf.MirrorPad"(%arg0, %cst) {device = "", mode = "SYMMETRIC"} : (tensor<5x10xf32>, tensor<2x2xi32>) -> tensor<8x13xf32> @@ -1041,12 +1059,12 @@ func.func @mirrorpad_symmetric(%arg0: tensor<5x10xf32>) -> tensor<8x13xf32> { // CHECK-LABEL: mirrorpad_reflect // CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xf32> -// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) {size = array, start = array} : (tensor<13x21x3xf32>) -// CHECK: %[[VAL_2:.*]] = "tosa.concat"(%[[VAL_1]], %[[VAL_0]]) {axis = 0 : i64} : (tensor<1x21x3xf32>, tensor<13x21x3xf32>) -// CHECK: %[[VAL_3:.*]] = "tosa.slice"(%[[VAL_2]]) {size = array, start = array} : (tensor<14x21x3xf32>) -// CHECK: %[[VAL_4:.*]] = "tosa.concat"(%[[VAL_3]], %[[VAL_2]]) {axis = 1 : i64} : (tensor<14x1x3xf32>, tensor<14x21x3xf32>) -// CHECK: %[[VAL_5:.*]] = "tosa.slice"(%[[VAL_4]]) {size = array, start = array} : (tensor<14x22x3xf32>) -// CHECK: %[[VAL_6:.*]] = "tosa.concat"(%[[VAL_5]], %[[VAL_4]]) {axis = 2 : i64} : (tensor<14x22x1xf32>, tensor<14x22x3xf32>) +// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) <{size = array, start = array}> : (tensor<13x21x3xf32>) +// CHECK: %[[VAL_2:.*]] = "tosa.concat"(%[[VAL_1]], %[[VAL_0]]) <{axis = 0 : i64}> : (tensor<1x21x3xf32>, tensor<13x21x3xf32>) +// CHECK: %[[VAL_3:.*]] = "tosa.slice"(%[[VAL_2]]) <{size = array, start = array}> : (tensor<14x21x3xf32>) +// CHECK: %[[VAL_4:.*]] = "tosa.concat"(%[[VAL_3]], %[[VAL_2]]) <{axis = 1 : i64}> : (tensor<14x1x3xf32>, tensor<14x21x3xf32>) +// CHECK: %[[VAL_5:.*]] = "tosa.slice"(%[[VAL_4]]) <{size = array, start = array}> : (tensor<14x22x3xf32>) +// CHECK: %[[VAL_6:.*]] = "tosa.concat"(%[[VAL_5]], %[[VAL_4]]) <{axis = 2 : i64}> : (tensor<14x22x1xf32>, tensor<14x22x3xf32>) func.func @mirrorpad_reflect(%arg0: tensor<13x21x3xf32>) -> tensor<14x22x4xf32> { %cst = "tf.Const"() {device = "", value = 
dense<[[1, 0], [1, 0], [1, 0]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32> %0 = "tf.MirrorPad"(%arg0, %cst) {device = "", mode = "REFLECT"} : (tensor<13x21x3xf32>, tensor<3x2xi32>) -> tensor<14x22x4xf32> diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir index 95c8f252767..469728b361f 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir @@ -26,7 +26,7 @@ func.func @test_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // CHECK-LABEL: test_matmul // CHECK-DAG: %[[VAR0:.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<28xf32>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<28xf32>}> // CHECK: %[[VAR2:.*]] = "tosa.transpose"(%arg1, %[[VAR0]]) // CHECK: %[[VAR3:.*]] = "tosa.fully_connected"(%arg0, %[[VAR2]], %[[VAR1]]) func.func @test_matmul(%arg0: tensor<14x19xf32>, %arg1: tensor<19x28xf32>) -> tensor<*xf32> { diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir index b16b7ffa83b..ddfc7eefe81 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir @@ -11,8 +11,8 @@ // ----- // CHECK-LABEL: test_conv2d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<16xf32>} -// CHECK: %[[VAR1:.*]] = "tosa.conv2d"(%arg0, %arg1, %[[VAR0]]) {dilation = array, pad = array, stride = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> +// CHECK: %[[VAR1:.*]] = "tosa.conv2d"(%arg0, %arg1, %[[VAR0]]) <{dilation = array, pad = array, stride = array}> func.func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>) -> tensor<*xf32> { %cst = arith.constant dense<0.000000e+00> : tensor<16xf32> %0 = "tfl.conv_2d"(%arg0, %arg1, %cst) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<*xf32> @@ -33,7 +33,7 @@ func.func @test_conv2d_dynamic(%arg0: tensor, %arg1: tensor<16x1x // ----- // CHECK-LABEL: test_conv2d_bias -// CHECK: %[[VAR0:.*]] = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.conv2d"(%arg0, %arg1, %arg2) <{dilation = array, pad = array, stride = array}> // CHECK-SAME: tensor<1x32x32x16xf32> func.func @test_conv2d_bias(%arg0: tensor<1x32x32x8xf32>, %cst: tensor<16x2x2x8xf32>, %cst_0: tensor<16xf32>) -> tensor<*xf32> { %0 = "tfl.conv_2d"(%arg0, %cst, %cst_0) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<*xf32> @@ -43,8 +43,8 @@ func.func @test_conv2d_bias(%arg0: tensor<1x32x32x8xf32>, %cst: tensor<16x2x2x8x // ----- // CHECK-LABEL: test_transpose_conv2d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<16xf32>} -// CHECK: %[[VAR1:.*]] = "tosa.transpose_conv2d"(%arg0, %arg1, %[[VAR0]]) {out_pad = array, out_shape = array, stride = array} +// CHECK-DAG: 
%[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> +// CHECK: %[[VAR1:.*]] = "tosa.transpose_conv2d"(%arg0, %arg1, %[[VAR0]]) <{out_pad = array, out_shape = array, stride = array}> func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %cst_0: tensor<16x1x1x8xf32>) -> tensor<1x32x32x16xf32> { %cst = arith.constant dense<[1, 32, 32, 16]> : tensor<4xi32> %cst_1 = "tfl.no_value"() {value = unit} : () -> none @@ -55,9 +55,9 @@ func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %cst_0: tensor<16 // ----- // CHECK-LABEL: test_transpose_conv2d_relu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<16xf32>} -// CHECK: %[[VAR1:.*]] = "tosa.transpose_conv2d"(%arg0, %arg1, %[[VAR0]]) {out_pad = array, out_shape = array, stride = array} -// CHECK: %[[VAR2:.*]] = "tosa.clamp"(%[[VAR1]]) {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> +// CHECK: %[[VAR1:.*]] = "tosa.transpose_conv2d"(%arg0, %arg1, %[[VAR0]]) <{out_pad = array, out_shape = array, stride = array}> +// CHECK: %[[VAR2:.*]] = "tosa.clamp"(%[[VAR1]]) <{max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64}> func.func @test_transpose_conv2d_relu(%arg0: tensor<1x32x32x8xf32>, %cst_0: tensor<16x1x1x8xf32>) -> tensor<1x32x32x16xf32> { %cst = arith.constant dense<[1, 32, 32, 16]> : tensor<4xi32> %cst_1 = "tfl.no_value"() {value = unit} : () -> none @@ -68,9 +68,9 @@ func.func @test_transpose_conv2d_relu(%arg0: tensor<1x32x32x8xf32>, %cst_0: tens // ----- // CHECK-LABEL: test_conv2d_qi8 -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<16x2x2x8xi8>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<0> : tensor<16xi32>} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.conv2d"(%arg0, %[[VAR0]], %[[VAR1]]) {dilation = array, pad = array, quantization_info = #tosa.conv_quant, stride = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<16x2x2x8xi8>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0> : tensor<16xi32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.conv2d"(%arg0, %[[VAR0]], %[[VAR1]]) <{dilation = array, pad = array, quantization_info = #tosa.conv_quant, stride = array}> // CHECK: %[[VAR3:.*]] = "tosa.rescale"(%[[VAR2]]) func.func @test_conv2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<1x32x32x16x!quant.uniform> { %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x2x2x8x!quant.uniform:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1}>>, value = dense<42> : tensor<16x2x2x8xi8>} : () -> tensor<16x2x2x8x!quant.uniform:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1} >> @@ -82,9 +82,9 @@ func.func @test_conv2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform : tensor<16xi48>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<16x1x1x8xi8>} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.conv2d"(%arg0, %[[VAR1]], %[[VAR0]]) {dilation = array, pad = array, quantization_info = #tosa.conv_quant, stride = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0> : tensor<16xi48>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<16x1x1x8xi8>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.conv2d"(%arg0, %[[VAR1]], %[[VAR0]]) <{dilation = array, pad = array, quantization_info = #tosa.conv_quant, stride = array}> 
// CHECK: %[[VAR3:.*]] = "tosa.rescale"(%[[VAR2]]) func.func @test_conv2d_qi16(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<1x32x32x16x!quant.uniform> { %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x1x1x8x!quant.uniform>, value = dense<42> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8x!quant.uniform> @@ -97,10 +97,9 @@ func.func @test_conv2d_qi16(%arg0: tensor<1x32x32x8x!quant.uniform // CHECK-LABEL: @test_depthwise_conv2d_bias_qi8 // CHECK-SAME: %[[ARG0:.*]]: tensor<1x32x32x8x!quant.uniform> -// CHECK-DAG: %[[CONST:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<16xi32>} -// CHECK-DAG: %[[CONST_0:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<1x2x2x16xi8>} -// CHECK-DAG: %[[RESHAPE:.*]] = "tosa.reshape"(%[[CONST_0]]) {new_shape = array} -// CHECK-DAG: %[[DEPTHWISE:.*]] = "tosa.depthwise_conv2d"(%[[ARG0]], %[[RESHAPE]], %[[CONST]]) {dilation = array, pad = array, quantization_info = #tosa.conv_quant, stride = array} +// CHECK-DAG: %[[CONST:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<16xi32>}> +// CHECK-DAG: %[[CONST_0:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<2x2x8x2xi8>}> +// CHECK-DAG: %[[DEPTHWISE:.*]] = "tosa.depthwise_conv2d"(%[[ARG0]], %[[CONST_0]], %[[CONST]]) <{dilation = array, pad = array, quantization_info = #tosa.conv_quant, stride = array}> // CHECK: %[[RESCALE:.*]] = "tosa.rescale"(%[[DEPTHWISE]]) // CHECK-SAME: multiplier = array // CHECK-SAME: shift = array @@ -114,6 +113,75 @@ func.func @test_depthwise_conv2d_bias_qi8(%arg0: tensor<1x32x32x8x!quant.uniform // ----- +// CHECK-LABEL: @test_conv2d_grouped_convolution +// CHECK-DAG: %[[INPUT_SLICE_1:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[FILTER_SLICE_1:.*]] = "tosa.slice"(%arg1) <{size = array, start = array}> +// CHECK-DAG: %[[BIAS_SLICE_1:.*]] = "tosa.slice"(%arg2) <{size = array, start = array}> +// CHECK-DAG: %[[CONV_1:.*]] = "tosa.conv2d"(%[[INPUT_SLICE_1]], %[[FILTER_SLICE_1]], %[[BIAS_SLICE_1]]) <{dilation = array, pad = array, stride = array}> +// CHECK-DAG: %[[INPUT_SLICE_2:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[FILTER_SLICE_2:.*]] = "tosa.slice"(%arg1) <{size = array, start = array}> +// CHECK-DAG: %[[BIAS_SLICE_2:.*]] = "tosa.slice"(%arg2) <{size = array, start = array}> +// CHECK-DAG: %[[CONV_2:.*]] = "tosa.conv2d"(%[[INPUT_SLICE_2]], %[[FILTER_SLICE_2]], %[[BIAS_SLICE_2]]) <{dilation = array, pad = array, stride = array}> +// CHECK-DAG: %[[CONCAT:.*]] = "tosa.concat"(%[[CONV_1]], %[[CONV_2]]) <{axis = 3 : i64}> +// CHECK: return %[[CONCAT]] +func.func @test_conv2d_grouped_convolution(%input: tensor<1x4x1x128xf32>, %weights: tensor<128x1x1x64xf32>, %bias: tensor<128xf32>) -> tensor<1x4x1x128xf32> { + %0 = "tfl.conv_2d"(%input, %weights, %bias) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x4x1x128xf32>, tensor<128x1x1x64xf32>, tensor<128xf32>) -> (tensor<1x4x1x128xf32>) + return %0 : tensor<1x4x1x128xf32> +} + +// ----- + +// CHECK-LABEL: @test_conv2d_grouped_strided_convolution +// CHECK-DAG: %[[INPUT_SLICE_1:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[FILTER_SLICE_1:.*]] = "tosa.slice"(%arg1) <{size = array, start = array}> +// CHECK-DAG: %[[BIAS_SLICE_1:.*]] = "tosa.slice"(%arg2) <{size = array, start = array}> +// CHECK-DAG: %[[CONV_1:.*]] = "tosa.conv2d"(%[[INPUT_SLICE_1]], %[[FILTER_SLICE_1]], %[[BIAS_SLICE_1]]) <{dilation = array, pad = 
array, stride = array}> +// CHECK-DAG: %[[INPUT_SLICE_2:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[FILTER_SLICE_2:.*]] = "tosa.slice"(%arg1) <{size = array, start = array}> +// CHECK-DAG: %[[BIAS_SLICE_2:.*]] = "tosa.slice"(%arg2) <{size = array, start = array}> +// CHECK-DAG: %[[CONV_2:.*]] = "tosa.conv2d"(%[[INPUT_SLICE_2]], %[[FILTER_SLICE_2]], %[[BIAS_SLICE_2]]) <{dilation = array, pad = array, stride = array}> +// CHECK-DAG: %[[INPUT_SLICE_3:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[FILTER_SLICE_3:.*]] = "tosa.slice"(%arg1) <{size = array, start = array}> +// CHECK-DAG: %[[BIAS_SLICE_3:.*]] = "tosa.slice"(%arg2) <{size = array, start = array}> +// CHECK-DAG: %[[CONV_3:.*]] = "tosa.conv2d"(%[[INPUT_SLICE_3]], %[[FILTER_SLICE_3]], %[[BIAS_SLICE_3]]) <{dilation = array, pad = array, stride = array}> +// CHECK-DAG: %[[INPUT_SLICE_4:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[FILTER_SLICE_4:.*]] = "tosa.slice"(%arg1) <{size = array, start = array}> +// CHECK-DAG: %[[BIAS_SLICE_4:.*]] = "tosa.slice"(%arg2) <{size = array, start = array}> +// CHECK-DAG: %[[CONV_4:.*]] = "tosa.conv2d"(%[[INPUT_SLICE_4]], %[[FILTER_SLICE_4]], %[[BIAS_SLICE_4]]) <{dilation = array, pad = array, stride = array}> +// CHECK-DAG: %[[CONCAT:.*]] = "tosa.concat"(%[[CONV_1]], %[[CONV_2]], %[[CONV_3]], %[[CONV_4]]) <{axis = 3 : i64}> +// CHECK: return %[[CONCAT]] +func.func @test_conv2d_grouped_strided_convolution(%input: tensor<1x3x1x64xf32>, %weights: tensor<512x3x1x16xf32>, %bias: tensor<512xf32>) -> tensor<1x2x1x512xf32> { + %0 = "tfl.conv_2d"(%input, %weights, %bias) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 1 : i32} : (tensor<1x3x1x64xf32>, tensor<512x3x1x16xf32>, tensor<512xf32>) -> (tensor<1x2x1x512xf32>) + return %0 : tensor<1x2x1x512xf32> +} + +// ----- + +// CHECK-LABEL: @test_conv2d_q_grouped_convolution +// CHECK-DAG: %[[BIAS:.*]] = "tosa.const"() <{value = dense<0> : tensor<16xi32>}> +// CHECK-DAG: %[[FILTER:.*]] = "tosa.const"() <{value = dense<42> : tensor<16x1x1x8xi8>}> +// CHECK-DAG: %[[INPUT_SLICE_1:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[FILTER_SLICE_1:.*]] = "tosa.slice"(%[[FILTER]]) <{size = array, start = array}> +// CHECK-DAG: %[[BIAS_SLICE_1:.*]] = "tosa.slice"(%[[BIAS]]) <{size = array, start = array}> +// CHECK-DAG: %[[CONV_1:.*]] = "tosa.conv2d"(%[[INPUT_SLICE_1]], %[[FILTER_SLICE_1]], %[[BIAS_SLICE_1]]) <{dilation = array, pad = array, quantization_info = #tosa.conv_quant, stride = array}> +// CHECK-DAG: %[[RESCALE_1:.*]] = "tosa.rescale"(%[[CONV_1]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = true, scale32 = true, shift = array}> +// CHECK-DAG: %[[INPUT_SLICE_2:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[FILTER_SLICE_2:.*]] = "tosa.slice"(%[[FILTER]]) <{size = array, start = array}> +// CHECK-DAG: %[[BIAS_SLICE_2:.*]] = "tosa.slice"(%[[BIAS]]) <{size = array, start = array}> +// CHECK-DAG: %[[CONV_2:.*]] = "tosa.conv2d"(%[[INPUT_SLICE_2]], %[[FILTER_SLICE_2]], %[[BIAS_SLICE_2]]) <{dilation = array, pad = array, quantization_info = #tosa.conv_quant, stride = array}> +// CHECK-DAG: %[[RESCALE_2:.*]] = "tosa.rescale"(%[[CONV_2]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = true, scale32 = true, shift = array}> +// 
CHECK-DAG: %[[CONCAT:.*]] = "tosa.concat"(%[[RESCALE_1]], %[[RESCALE_2]]) <{axis = 3 : i64}> +// CHECK: return %[[CONCAT]] + +func.func @test_conv2d_q_grouped_convolution(%input: tensor<1x4x1x16x!quant.uniform>) -> tensor<1x4x1x16x!quant.uniform> { + %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x1x1x8x!quant.uniform:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1}>>, value = dense<42> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8x!quant.uniform:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1} >> + %1 = "tfl.pseudo_qconst"() {qtype = tensor<16x!quant.uniform>, value = dense<0> : tensor<16xi32>} : () -> tensor<16x!quant.uniform> + %2 = "tfl.conv_2d"(%input, %0, %1) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x4x1x16x!quant.uniform>, tensor<16x1x1x8x!quant.uniform:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1}>>, tensor<16x!quant.uniform>) -> tensor<1x4x1x16x!quant.uniform> + return %2 : tensor<1x4x1x16x!quant.uniform> +} + +// ----- + // CHECK-LABEL: test_depthwise_conv2d_bias_inferred func.func @test_depthwise_conv2d_bias_inferred(%arg0: tensor, %arg1 : tensor<1x1x1x16xf32>, %arg2 : tensor<16xf32>) -> tensor { // CHECK: tosa.depthwise_conv2d @@ -127,10 +195,10 @@ func.func @test_depthwise_conv2d_bias_inferred(%arg0: tensor, %ar // CHECK-LABEL: test_conv3d // CHECK-SAME: %[[VAL_0:.*]]: tensor<2x2x7x7x2xf32> // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3x3x2x4xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<4xf32>} -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() {value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>} +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<4xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>}> // CHECK: %[[VAL_4:.*]] = "tosa.transpose"(%[[VAL_1]], %[[VAL_3]]) -// CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) {dilation = array, pad = array, stride = array} +// CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) <{dilation = array, pad = array, stride = array}> func.func @test_conv3d(%arg0: tensor<2x2x7x7x2xf32>, %arg1: tensor<2x3x3x2x4xf32>) -> tensor<2x2x7x7x4xf32> { %cst = "tfl.no_value"() {value} : () -> none %0 = "tfl.conv_3d"(%arg0, %arg1, %cst) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<2x2x7x7x2xf32>, tensor<2x3x3x2x4xf32>, none) -> tensor<2x2x7x7x4xf32> @@ -142,10 +210,10 @@ func.func @test_conv3d(%arg0: tensor<2x2x7x7x2xf32>, %arg1: tensor<2x3x3x2x4xf32 // CHECK-LABEL: test_conv3d_dynamic // CHECK-SAME: %[[VAL_0:.*]]: tensor // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x1x1x8x16xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<16xf32>} -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() {value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>} +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>}> // CHECK: %[[VAL_4:.*]] = "tosa.transpose"(%[[VAL_1]], %[[VAL_3]]) -// CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) {dilation = array, pad = array, stride = array} +// 
CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) <{dilation = array, pad = array, stride = array}> func.func @test_conv3d_dynamic(%arg0: tensor, %arg1: tensor<3x1x1x8x16xf32>) -> tensor<*xf32> { %cst = "tfl.no_value"() {value} : () -> none %0 = "tfl.conv_3d"(%arg0, %arg1, %cst) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} : (tensor, tensor<3x1x1x8x16xf32>, none) -> tensor<*xf32> @@ -158,9 +226,9 @@ func.func @test_conv3d_dynamic(%arg0: tensor, %arg1: tensor<3x // CHECK-SAME: %[[VAL_0:.*]]: tensor<10x3x64x64x12xf32> // CHECK-SAME: %[[VAL_1:.*]]: tensor<16x2x2x12x8xf32> // CHECK-SAME: %[[VAL_2:.*]]: tensor<8xf32> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() {value = dense<[4, 0, 1, 2, 3]> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<[4, 0, 1, 2, 3]> // CHECK: %[[VAL_4:.*]] = "tosa.transpose"(%[[VAL_1]], %[[VAL_3]]) -// CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) {dilation = array, pad = array, stride = array} +// CHECK: %[[VAL_5:.*]] = "tosa.conv3d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_2]]) <{dilation = array, pad = array, stride = array}> func.func @test_conv3d_bias(%arg0: tensor<10x3x64x64x12xf32>, %arg1: tensor<16x2x2x12x8xf32>, %cst: tensor<8xf32>) -> tensor<10x3x64x64x8xf32> { %0 = "tfl.conv_3d"(%arg0, %arg1, %cst) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<10x3x64x64x12xf32>, tensor<16x2x2x12x8xf32>, tensor<8xf32>) -> tensor<10x3x64x64x8xf32> func.return %0 : tensor<10x3x64x64x8xf32> @@ -171,20 +239,18 @@ func.func @test_conv3d_bias(%arg0: tensor<10x3x64x64x12xf32>, %arg1: tensor<16x2 // CHECK-LABEL: test_conv3d_qi8( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4x8x21x17x!quant.uniform> // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3x3x17x34xf32>) -> tensor<1x4x8x11x34x!quant.uniform> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<1x1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() {value = dense<0.0156862643> : tensor<1x1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() {value = dense<1.11982894> : tensor<1x1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() {value = dense<-4.000000e+00> : tensor<1x1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<34xf32>} -// CHECK-DAG: %[[VAL_7:.*]] = "tosa.const"() {value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>} -// CHECK: %[[VAL_8:.*]] = "tosa.cast"(%[[VAL_0]]) -// CHECK: %[[VAL_9:.*]] = "tosa.sub"(%[[VAL_8]], %[[VAL_2]]) -// CHECK: %[[VAL_10:.*]] = "tosa.mul"(%[[VAL_9]], %[[VAL_3]]) {shift = 0 : i32} -// CHECK: %[[VAL_11:.*]] = "tosa.transpose"(%[[VAL_1]], %[[VAL_7]]) -// CHECK: %[[VAL_12:.*]] = "tosa.conv3d"(%[[VAL_10]], %[[VAL_11]], %[[VAL_6]]) {dilation = array, pad = array, stride = array} -// CHECK: %[[VAL_13:.*]] = "tosa.mul"(%[[VAL_12]], %[[VAL_4]]) {shift = 0 : i32} -// CHECK: %[[VAL_14:.*]] = "tosa.add"(%[[VAL_13]], %[[VAL_5]]) -// CHECK: %[[VAL_15:.*]] = "tosa.cast"(%[[VAL_14]]) +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.0156862643> : tensor<1x1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<1.11982894> : tensor<1x1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<-4.000000e+00> : 
tensor<1x1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<34xf32>}> +// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>}> +// CHECK: %[[VAL_7:.*]] = "tosa.cast"(%[[VAL_0]]) +// CHECK: %[[VAL_8:.*]] = "tosa.mul"(%[[VAL_7]], %[[VAL_2]]) <{shift = 0 : i32}> +// CHECK: %[[VAL_9:.*]] = "tosa.transpose"(%[[VAL_1]], %[[VAL_6]]) +// CHECK: %[[VAL_10:.*]] = "tosa.conv3d"(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]]) <{dilation = array, pad = array, stride = array}> +// CHECK: %[[VAL_11:.*]] = "tosa.mul"(%[[VAL_10]], %[[VAL_3]]) <{shift = 0 : i32}> +// CHECK: %[[VAL_12:.*]] = "tosa.add"(%[[VAL_11]], %[[VAL_4]]) +// CHECK: %[[VAL_13:.*]] = "tosa.cast"(%[[VAL_12]]) func.func @test_conv3d_qi8(%arg0: tensor<1x4x8x21x17x!quant.uniform>, %arg1: tensor<2x3x3x17x34xf32>) -> (tensor<1x4x8x11x34x!quant.uniform>) { %0 = "tfl.dequantize"(%arg0) : (tensor<1x4x8x21x17x!quant.uniform>) -> tensor<1x4x8x21x17xf32> %2 = "tfl.no_value"() {value} : () -> none @@ -232,7 +298,7 @@ func.func @test_sub_unranked(%arg0: tensor<1x21x3xf32>, %arg1: tensor<1x1x1xf32> // ----- // CHECK-LABEL: test_mul -// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg1) {shift = 0 : i32} +// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg1) <{shift = 0 : i32}> func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xf32> { %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xf32> func.return %0 : tensor<13x21x3xf32> @@ -241,7 +307,7 @@ func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> te // ----- // CHECK-LABEL: test_mul_unranked -// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg1) {shift = 0 : i32} +// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg1) <{shift = 0 : i32}> func.func @test_mul_unranked(%arg0: tensor<13x21x3xf32>, %arg1: tensor<1x1x1xf32>) -> tensor<*xf32> { %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xf32>, tensor<1x1x1xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -289,7 +355,7 @@ func.func @test_floor_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor) -> ten // ----- // CHECK-LABEL: test_relu1 -// CHECK: %[[VAL0:.*]] = "tosa.clamp"(%arg0) {max_fp = 1.000000e+00 : f32, max_int = 1 : i64, min_fp = -1.000000e+00 : f32, min_int = -1 : i64} +// CHECK: %[[VAL0:.*]] = "tosa.clamp"(%arg0) <{max_fp = 1.000000e+00 : f32, max_int = 1 : i64, min_fp = -1.000000e+00 : f32, min_int = -1 : i64}> func.func @test_relu1(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = "tfl.relu_n1_to_1"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> func.return %0 : tensor<13x21x3xf32> @@ -298,7 +364,7 @@ func.func @test_relu1(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_relu0To1 -// CHECK: %[[VAL0:.*]] = "tosa.clamp"(%arg0) {max_fp = 1.000000e+00 : f32, max_int = 1 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} +// CHECK: %[[VAL0:.*]] = "tosa.clamp"(%arg0) <{max_fp = 1.000000e+00 : f32, max_int = 1 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64}> func.func @test_relu0To1(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = "tfl.relu_0_to_1"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> func.return %0 : tensor<13x21x3xf32> @@ -307,7 +373,7 @@ func.func @test_relu0To1(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_relu6 -// CHECK: %[[VAR0:.*]] = "tosa.clamp"(%arg0) {max_fp = 6.000000e+00 : f32, 
max_int = 6 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.clamp"(%arg0) <{max_fp = 6.000000e+00 : f32, max_int = 6 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64}> func.func @test_relu6(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %0 = "tfl.relu6"(%arg0) : (tensor<13x21x3xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -316,7 +382,7 @@ func.func @test_relu6(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_relu6_dynamic -// CHECK: %[[VAR0:.*]] = "tosa.clamp"(%arg0) {max_fp = 6.000000e+00 : f32, max_int = 6 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.clamp"(%arg0) <{max_fp = 6.000000e+00 : f32, max_int = 6 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64}> // CHECK-SAME: -> tensor func.func @test_relu6_dynamic(%arg0: tensor) -> tensor { %0 = "tfl.relu6"(%arg0) : (tensor) -> tensor @@ -326,8 +392,8 @@ func.func @test_relu6_dynamic(%arg0: tensor) -> tensor { // ----- // CHECK-LABEL: test_leaky_relu -// CHECK: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.707330704> : tensor<1x1x1xf32>} -// CHECK: %[[VAR1:.*]] = "tosa.mul"(%arg0, %[[VAR0]]) {shift = 0 : i32} +// CHECK: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.707330704> : tensor<1x1x1xf32>}> +// CHECK: %[[VAR1:.*]] = "tosa.mul"(%arg0, %[[VAR0]]) <{shift = 0 : i32}> // CHECK: %[[VAR2:.*]] = "tosa.maximum"(%[[VAR1]], %arg0) : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: return %[[VAR2]] : tensor<13x21x3xf32> func.func @test_leaky_relu(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { @@ -338,9 +404,9 @@ func.func @test_leaky_relu(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_prelu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg1) {new_shape = array} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.mul"(%arg0, %[[VAR1]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg1) <{new_shape = array}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.mul"(%arg0, %[[VAR1]]) <{shift = 0 : i32}> // CHECK-DAG: %[[VAR3:.*]] = "tosa.greater_equal"(%arg0, %[[VAR0]]) // CHECK: %[[VAR4:.*]] = "tosa.select"(%[[VAR3]], %arg0, %[[VAR2]]) func.func @test_prelu(%arg0: tensor<4x2x3xf32>, %arg1: tensor<2x3xf32>) -> tensor<4x2x3xf32> { @@ -352,20 +418,20 @@ func.func @test_prelu(%arg0: tensor<4x2x3xf32>, %arg1: tensor<2x3xf32>) -> tenso // CHECK-LABEL: test_prelu_qu8 // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x4x17x!quant.uniform> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() {value = dense<0> : tensor<1x1x1x1xi32>} -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<8x4x17xi8>} -// CHECK: %[[VAL_3:.*]] = "tosa.rescale"(%[[VAL_0]]) {double_round = false, input_zp = 128 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_4:.*]] = "tosa.rescale"(%[[VAL_3]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_5:.*]] = "tosa.rescale"(%[[VAL_4]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0> : tensor<1x1x1x1xi32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() 
<{value = dense<{{.*}}> : tensor<8x4x17xi8>}> +// CHECK: %[[VAL_3:.*]] = "tosa.rescale"(%[[VAL_0]]) <{double_round = false, input_zp = 128 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK: %[[VAL_4:.*]] = "tosa.rescale"(%[[VAL_3]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK: %[[VAL_5:.*]] = "tosa.rescale"(%[[VAL_4]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> // CHECK: %[[VAL_6:.*]] = "tosa.greater_equal"(%[[VAL_5]], %[[VAL_1]]) : (tensor<1x8x4x17xi32>, tensor<1x1x1x1xi32>) -// CHECK: %[[VAL_7:.*]] = "tosa.rescale"(%[[VAL_2]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_8:.*]] = "tosa.reshape"(%[[VAL_7]]) {new_shape = array} : (tensor<8x4x17xi32>) -// CHECK: %[[VAL_9:.*]] = "tosa.mul"(%[[VAL_5]], %[[VAL_8]]) {shift = 0 : i32} : (tensor<1x8x4x17xi32>, tensor<1x8x4x17xi32>) -// CHECK: %[[VAL_10:.*]] = "tosa.rescale"(%[[VAL_9]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 5 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_11:.*]] = "tosa.rescale"(%[[VAL_4]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 5 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK: %[[VAL_7:.*]] = "tosa.rescale"(%[[VAL_2]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK: %[[VAL_8:.*]] = "tosa.reshape"(%[[VAL_7]]) <{new_shape = array}> : (tensor<8x4x17xi32>) +// CHECK: %[[VAL_9:.*]] = "tosa.mul"(%[[VAL_5]], %[[VAL_8]]) <{shift = 0 : i32}> : (tensor<1x8x4x17xi32>, tensor<1x8x4x17xi32>) +// CHECK: %[[VAL_10:.*]] = "tosa.rescale"(%[[VAL_9]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 5 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK: %[[VAL_11:.*]] = "tosa.rescale"(%[[VAL_4]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 5 : i32, per_channel = false, scale32 = true, shift = array}> // CHECK: %[[VAL_12:.*]] = "tosa.select"(%[[VAL_6]], %[[VAL_11]], %[[VAL_10]]) -// CHECK: %[[VAL_13:.*]] = "tosa.rescale"(%[[VAL_12]]) {double_round = true, input_zp = 5 : i32, multiplier = array, output_zp = 5 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_14:.*]] = "tosa.rescale"(%[[VAL_13]]) {double_round = false, input_zp = 5 : i32, multiplier = array, output_zp = 133 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK: %[[VAL_13:.*]] = "tosa.rescale"(%[[VAL_12]]) <{double_round = true, input_zp = 5 : i32, multiplier = array, output_zp = 5 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK: %[[VAL_14:.*]] = "tosa.rescale"(%[[VAL_13]]) <{double_round = false, input_zp = 5 : i32, multiplier = array, output_zp = 133 : i32, per_channel = false, scale32 = true, shift = array}> func.func @test_prelu_qu8(%arg0: tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> { %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x8x4x17x!quant.uniform>} : (tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> %1 = "tfl.pseudo_qconst"() {qtype = tensor<8x4x17x!quant.uniform:f32, 0.023982547223567963>>, value = 
dense<"0x191D0557FF212FA1137FDE2B247CE8BA2A8B2213F6B109FA12232EC613FEEE03EF2D265BE5E4F6CB0E09F7F0A95606DA1709EDE632D0F92A2002E98E61F9213997D3FCEBFA0D2DFC4DD00D0700C60C0705F3CFCB01D30C3617C7144C294DAE27061A62E70665021AF50827F40EC9E0172D42B9FB01FB076A09553006F7F710211A031EC9F11BCF130FCC1906D5FED8E5F64E06EAEAFEFD2515F20BB6E3401023C89DFCF8DEC0390B37D8CA2001E1F7BC270ADDE92DFC6D230CE1FEEE1DE8F90ABF9E3ECAEEBC311DF6FDE41F0E31ED0AC309B3121533E7EC2D1B0F1E04D44513E627F4ED5E491D10E53EEA45FF23E31D11D1DE2E0A3B1015AF06102329DEED5C1C180402000B0D071BF0D4FBC0DE0C3BF012E018D80716351D1922F8D508CF2708BA0CEAFE14E4972732FDFD283ED9342A1506F4F137200A12F436D6C9EC071FBCBDEBF4F8051426B8201EC410F9C3C7EFF7CD04D7AC34E2F9D73A5A05CFFA0FF7FD21D6BBEA03F16AF8330C1105285605C9FFE72BE04726DA06F2DCDCDC14C1310CF4E32F06BE0941420B10C9293DD10EFE28D4D20716E6E6EE0A101FFE3AAF1716120EF62FECEBC0F0D72A0903F9E74425EDF82E290E0413BB69F3F45AF30A22D4D024411B4D243BE13FB9CBE0F5FA16A1D7532007AEF62837C42406E3ED3CCE0408CA1C0CFA18B40C0BF7261E06D3E504B8E714BCF6F010DB12373739E200E609E9DAEF1922A2C338FEF2C519F0E5101E2AE917DCA3FA27D245DD10F0EBCE"> : tensor<8x4x17xi8>} : () -> tensor<8x4x17x!quant.uniform:f32, 0.023982547223567963>> @@ -378,15 +444,15 @@ func.func @test_prelu_qu8(%arg0: tensor<1x8x4x17x!quant.uniform> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() {value = dense<0> : tensor<1x1x1x1xi32>} -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<8x4x17xi8>} -// CHECK: %[[VAL_3:.*]] = "tosa.rescale"(%[[VAL_0]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0> : tensor<1x1x1x1xi32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<8x4x17xi8>}> +// CHECK: %[[VAL_3:.*]] = "tosa.rescale"(%[[VAL_0]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> // CHECK: %[[VAL_4:.*]] = "tosa.greater_equal"(%[[VAL_3]], %[[VAL_1]]) : (tensor<1x8x4x17xi32>, tensor<1x1x1x1xi32>) -// CHECK: %[[VAL_5:.*]] = "tosa.rescale"(%[[VAL_2]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_6:.*]] = "tosa.reshape"(%[[VAL_5]]) {new_shape = array} : (tensor<8x4x17xi32>) -// CHECK: %[[VAL_7:.*]] = "tosa.mul"(%[[VAL_3]], %[[VAL_6]]) {shift = 0 : i32} : (tensor<1x8x4x17xi32>, tensor<1x8x4x17xi32>) -// CHECK: %[[VAL_8:.*]] = "tosa.rescale"(%[[VAL_7]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 1 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_9:.*]] = "tosa.rescale"(%[[VAL_0]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 1 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK: %[[VAL_5:.*]] = "tosa.rescale"(%[[VAL_2]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK: %[[VAL_6:.*]] = "tosa.reshape"(%[[VAL_5]]) <{new_shape = array}> : (tensor<8x4x17xi32>) +// CHECK: %[[VAL_7:.*]] = "tosa.mul"(%[[VAL_3]], %[[VAL_6]]) <{shift = 0 : i32}> : (tensor<1x8x4x17xi32>, tensor<1x8x4x17xi32>) +// CHECK: %[[VAL_8:.*]] = "tosa.rescale"(%[[VAL_7]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 1 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK: %[[VAL_9:.*]] 
= "tosa.rescale"(%[[VAL_0]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 1 : i32, per_channel = false, scale32 = true, shift = array}> // CHECK: %[[VAL_10:.*]] = "tosa.select"(%[[VAL_4]], %[[VAL_9]], %[[VAL_8]]) func.func @test_prelu_qi8(%arg0: tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> { %0 = "tfl.pseudo_qconst"() {qtype = tensor<8x4x17x!quant.uniform:f32, 0.021805247291922569>>, value = dense<"0xDAFDEBC120CBE1E028231F05CF04F52484B2F0AC0041E618200308F820FE308FFCF2E1E02A06D00606FB1044C928D8D811E3FCCE350E25C4DE2B0D00E20AC1E215940D0D12C809290D480FE9E2DB26E31E50F5F4FDD31EFF21C210E717E187144F27C848E820C5D503E31729218D96D2D6D3D9C43BF13014EFCB043631AE4403FE2D4CDF1F16E2D13BA20AE92CEAB7323405F728CF3DF4E9BBFAFEFEE120ECA7FA120609030FF0FCF0E5D40939172EE7E256BADEC5ECFFB32C35F4E936E2F8092FE2E3EFE22B0C02F5EE1D36DE03CBE02FF346081C30ED882AECCAF4E4E3361604EABF133CB6371DDAFCDA4F2D32034A270BF0120A0048131331E50D11CAEB1DEE0ADFC0F12531E8351DD7BDEB2821FF3ECC34F8D42EE4D6FF2AE5FEEDFC3DF7463CED10192CE4B728151827A92E000EE31CF3C5DF193DAC2836181BD916D339E914192B14F0163C58C500BDC6BAEFFB03EC33DA24E7FF0E292CE30504B3070AB5FDE6D7E7CB4CB0D818F90919EAEF5DFDF2DB6C4132DF8EF2E40AF7EA04F1D496F22F2971420FF01D012E2954D5081C0AF2C5E5DED2CCD8C6157416201AFF3A2B29FBDD9EF06340B021F45C322A202DDD86111EBDF44BE9110E29F3FE7FDEDDFB5FDEDBD933E2ED0DD4E21C4BC6FD28E31934C821CE10F61C12740A100F1BE205CC01434BD7E3FB14F01CE0E406710022E464E0F0D8FB3D01C733C9C94017FAC50BE812D202E2B10C04E70AF326CEFD0DE20ABD153D3D14171C34061DE5FC5A"> : tensor<8x4x17xi8>} : () -> tensor<8x4x17x!quant.uniform:f32, 0.021805247291922569>> @@ -434,7 +500,7 @@ func.func @test_reduce_sum_axis_out_of_bounds(%arg0: tensor<13x21x3xf32>) -> ten // CHECK-LABEL: test_reduce_all_axis_1_keep_true // CHECK-SAME: %[[VAL_0:.+]]: tensor<1x4x8x19xi1> -// CHECK: %[[VAL_1:.*]] = "tosa.reduce_all"(%[[VAL_0]]) {axis = 1 : i64} : (tensor<1x4x8x19xi1>) -> tensor<1x1x8x19xi1> +// CHECK: %[[VAL_1:.*]] = "tosa.reduce_all"(%[[VAL_0]]) <{axis = 1 : i64}> : (tensor<1x4x8x19xi1>) -> tensor<1x1x8x19xi1> func.func @test_reduce_all_axis_1_keep_true(%arg0: tensor<1x4x8x19xi1>) -> tensor<1x1x8x19xi1> { %cst = arith.constant dense<1> : tensor<1xi32> %0 = "tfl.reduce_all"(%arg0, %cst) {keep_dims = true} : (tensor<1x4x8x19xi1>, tensor<1xi32>) -> tensor<1x1x8x19xi1> @@ -445,8 +511,8 @@ func.func @test_reduce_all_axis_1_keep_true(%arg0: tensor<1x4x8x19xi1>) -> tenso // CHECK-LABEL: test_reduce_all_axis_1_keep_false // CHECK-SAME: %[[VAL_0:.+]]: tensor<1x4x8x19xi1> -// CHECK: %[[VAL_1:.*]] = "tosa.reduce_all"(%[[VAL_0]]) {axis = 1 : i64} : (tensor<1x4x8x19xi1>) -> tensor<1x1x8x19xi1> -// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%[[VAL_1]]) {new_shape = array} : (tensor<1x1x8x19xi1>) -> tensor<1x8x19xi1> +// CHECK: %[[VAL_1:.*]] = "tosa.reduce_all"(%[[VAL_0]]) <{axis = 1 : i64}> : (tensor<1x4x8x19xi1>) -> tensor<1x1x8x19xi1> +// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%[[VAL_1]]) <{new_shape = array}> : (tensor<1x1x8x19xi1>) -> tensor<1x8x19xi1> func.func @test_reduce_all_axis_1_keep_false(%arg0: tensor<1x4x8x19xi1>) -> tensor<1x8x19xi1> { %cst = arith.constant dense<1> : tensor<1xi32> %0 = "tfl.reduce_all"(%arg0, %cst) {keep_dims = false} : (tensor<1x4x8x19xi1>, tensor<1xi32>) -> tensor<1x8x19xi1> @@ -457,7 +523,7 @@ func.func @test_reduce_all_axis_1_keep_false(%arg0: tensor<1x4x8x19xi1>) -> tens // CHECK-LABEL: test_reduce_all_axis_2_keep_true // CHECK-SAME: %[[VAL_0:.+]]: tensor<1x4x8x19xi1> -// CHECK: %[[VAL_1:.*]] = "tosa.reduce_all"(%[[VAL_0]]) {axis = 2 : i64} 
: (tensor<1x4x8x19xi1>) -> tensor<1x4x1x19xi1> +// CHECK: %[[VAL_1:.*]] = "tosa.reduce_all"(%[[VAL_0]]) <{axis = 2 : i64}> : (tensor<1x4x8x19xi1>) -> tensor<1x4x1x19xi1> func.func @test_reduce_all_axis_2_keep_true(%arg0: tensor<1x4x8x19xi1>) -> tensor<1x4x1x19xi1> { %cst = arith.constant dense<2> : tensor<1xi32> %0 = "tfl.reduce_all"(%arg0, %cst) {keep_dims = true} : (tensor<1x4x8x19xi1>, tensor<1xi32>) -> tensor<1x4x1x19xi1> @@ -468,8 +534,8 @@ func.func @test_reduce_all_axis_2_keep_true(%arg0: tensor<1x4x8x19xi1>) -> tenso // CHECK-LABEL: test_reduce_all_axis_2_keep_false // CHECK-SAME: %[[VAL_0:.+]]: tensor<1x4x8x19xi1> -// CHECK: %[[VAL_1:.*]] = "tosa.reduce_all"(%[[VAL_0]]) {axis = 2 : i64} : (tensor<1x4x8x19xi1>) -> tensor<1x4x1x19xi1> -// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%[[VAL_1]]) {new_shape = array} : (tensor<1x4x1x19xi1>) -> tensor<1x4x19xi1> +// CHECK: %[[VAL_1:.*]] = "tosa.reduce_all"(%[[VAL_0]]) <{axis = 2 : i64}> : (tensor<1x4x8x19xi1>) -> tensor<1x4x1x19xi1> +// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%[[VAL_1]]) <{new_shape = array}> : (tensor<1x4x1x19xi1>) -> tensor<1x4x19xi1> func.func @test_reduce_all_axis_2_keep_false(%arg0: tensor<1x4x8x19xi1>) -> tensor<1x4x19xi1> { %cst = arith.constant dense<2> : tensor<1xi32> %0 = "tfl.reduce_all"(%arg0, %cst) {keep_dims = false} : (tensor<1x4x8x19xi1>, tensor<1xi32>) -> tensor<1x4x19xi1> @@ -479,8 +545,8 @@ func.func @test_reduce_all_axis_2_keep_false(%arg0: tensor<1x4x8x19xi1>) -> tens // ----- // CHECK-LABEL: test_reduce_any -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_any"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_any"(%arg0) <{axis = 0 : i64}> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> func.func @test_reduce_any(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { %cst = arith.constant dense<0> : tensor<1xi32> %0 = "tfl.reduce_any"(%arg0, %cst) {keep_dims = false} : (tensor<13x21x3xi1>, tensor<1xi32>) -> tensor<21x3xi1> @@ -490,8 +556,8 @@ func.func @test_reduce_any(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { // ----- // CHECK-LABEL: test_reduce_min -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_min"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_min"(%arg0) <{axis = 0 : i64}> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> func.func @test_reduce_min(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> %0 = "tfl.reduce_min"(%arg0, %cst) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> @@ -501,8 +567,8 @@ func.func @test_reduce_min(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // ----- // CHECK-LABEL: test_reduce_max -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_max"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_max"(%arg0) <{axis = 0 : i64}> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> func.func @test_reduce_max(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> %0 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> @@ -512,21 +578,23 @@ func.func @test_reduce_max(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // ----- // CHECK-LABEL: test_reduce_sum -// CHECK-DAG: %[[VAR0:.*]] = 
"tosa.reduce_sum"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_sum"(%arg0) <{axis = 0 : i64}> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> func.func @test_reduce_sum(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> %0 = "tfl.sum"(%arg0, %cst) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> func.return %0 : tensor<21x3xf32> } +// ----- + // CHECK-LABEL: test_reduce_sum_nonzero_axis // CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30x40x50xf32> -// CHECK: %[[VAL_1:.*]] = "tosa.const"() {value = dense<[0, 1, 2, 4, 3]> : tensor<5xi32>} : () -> tensor<5xi32> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<[0, 1, 2, 4, 3]> : tensor<5xi32>}> : () -> tensor<5xi32> // CHECK: %[[VAL_2:.*]] = "tosa.transpose"(%[[VAL_0]], %[[VAL_1]]) : (tensor<10x20x30x40x50xf32>, tensor<5xi32>) -> tensor<10x20x30x50x40xf32> -// CHECK: %[[VAL_3:.*]] = "tosa.reshape"(%[[VAL_2]]) {new_shape = array} : (tensor<10x20x30x50x40xf32>) -> tensor<300000x40xf32> -// CHECK: %[[VAL_4:.*]] = "tosa.reduce_sum"(%[[VAL_3]]) {axis = 1 : i64} : (tensor<300000x40xf32>) -> tensor<300000x1xf32> -// CHECK: %[[VAL_5:.*]] = "tosa.reshape"(%[[VAL_4]]) {new_shape = array} : (tensor<300000x1xf32>) -> tensor<10x20x30x50xf32> +// CHECK: %[[VAL_3:.*]] = "tosa.reshape"(%[[VAL_2]]) <{new_shape = array}> : (tensor<10x20x30x50x40xf32>) -> tensor<300000x40xf32> +// CHECK: %[[VAL_4:.*]] = "tosa.reduce_sum"(%[[VAL_3]]) <{axis = 1 : i64}> : (tensor<300000x40xf32>) -> tensor<300000x1xf32> +// CHECK: %[[VAL_5:.*]] = "tosa.reshape"(%[[VAL_4]]) <{new_shape = array}> : (tensor<300000x1xf32>) -> tensor<10x20x30x50xf32> // CHECK: return %[[VAL_5]] : tensor<10x20x30x50xf32> func.func @test_reduce_sum_nonzero_axis(%arg0: tensor<10x20x30x40x50xf32> {tf._user_specified_name = "inp_list"}) -> tensor<10x20x30x50xf32> { %cst = arith.constant dense<3> : tensor @@ -536,16 +604,14 @@ func.func @test_reduce_sum_nonzero_axis(%arg0: tensor<10x20x30x40x50xf32> {tf._u // ----- -// ----- - // CHECK-LABEL: test_reduce_sum_5D func.func @test_reduce_sum_5D(%arg0: tensor<4x5x6x7x8xf32>) -> tensor<6x8xf32> { %cst = arith.constant dense<[0, 1, 3]> : tensor<3xi32> - // CHECK-DAG: %[[PERM:.+]] = "tosa.const"() {value = dense<[2, 4, 0, 1, 3]> : tensor<5xi32>} + // CHECK-DAG: %[[PERM:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3]> : tensor<5xi32>}> // CHECK-DAG: %[[TRANSPOSE:.+]] = "tosa.transpose"(%arg0, %[[PERM]]) - // CHECK-DAG: %[[RESHAPE0:.+]] = "tosa.reshape"(%[[TRANSPOSE:.+]]) {new_shape = array} - // CHECK-DAG: %[[REDUCE:.+]] = "tosa.reduce_sum"(%[[RESHAPE0]]) {axis = 1 : i64} - // CHECK: %[[RESHAPE1:.+]] = "tosa.reshape"(%[[REDUCE]]) {new_shape = array} + // CHECK-DAG: %[[RESHAPE0:.+]] = "tosa.reshape"(%[[TRANSPOSE:.+]]) <{new_shape = array}> + // CHECK-DAG: %[[REDUCE:.+]] = "tosa.reduce_sum"(%[[RESHAPE0]]) <{axis = 1 : i64}> + // CHECK: %[[RESHAPE1:.+]] = "tosa.reshape"(%[[REDUCE]]) <{new_shape = array}> %0 = "tfl.sum"(%arg0, %cst) {keep_dims = false} : (tensor<4x5x6x7x8xf32>, tensor<3xi32>) -> tensor<6x8xf32> func.return %0 : tensor<6x8xf32> } @@ -553,10 +619,10 @@ func.func @test_reduce_sum_5D(%arg0: tensor<4x5x6x7x8xf32>) -> tensor<6x8xf32> { // ----- // CHECK-LABEL: test_reduce_mean -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.0769230798> : tensor<1x1xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%arg0) {axis = 0 : i64} -// CHECK-DAG: 
%[[VAR2:.*]] = "tosa.reshape"(%[[VAR1]]) {new_shape = array} -// CHECK: %[[VAR4:.*]] = "tosa.mul"(%[[VAR2]], %[[VAR0]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.0769230798> : tensor<1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%arg0) <{axis = 0 : i64}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.reshape"(%[[VAR1]]) <{new_shape = array}> +// CHECK: %[[VAR4:.*]] = "tosa.mul"(%[[VAR2]], %[[VAR0]]) <{shift = 0 : i32}> func.func @test_reduce_mean(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> %0 = "tfl.mean"(%arg0, %cst) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> @@ -576,8 +642,8 @@ func.func @test_reduce_mean_out_of_bounds(%arg0: tensor<13x21x3xf32>) -> tensor< // ----- // CHECK-LABEL: test_reduce_product -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_prod"(%arg0) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_prod"(%arg0) <{axis = 0 : i64}> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> func.func @test_reduce_product(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> %0 = "tfl.reduce_prod"(%arg0, %cst) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor<21x3xf32> @@ -679,10 +745,38 @@ func.func @test_negate(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_rsqrt -// CHECK: %[[VAR0:.*]] = "tosa.rsqrt"(%arg0) -func.func @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { - %0 = "tfl.rsqrt"(%arg0) : (tensor<13x21x3xf32>) -> tensor<*xf32> - func.return %0 : tensor<*xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xf32> +// CHECK: %[[VAL_1:.*]] = "tosa.rsqrt"(%[[VAL_0]]) : (tensor<13x21x3xf32>) +func.func @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = "tfl.rsqrt"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + func.return %0 : tensor<13x21x3xf32> +} + +// ----- + +// CHECK-LABEL: test_rsqrt_qi8 +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<{{.+}}> : tensor<256xi8>}> +// CHECK: %[[VAL_2:.*]] = "tosa.table"(%[[VAL_0]], %[[VAL_1]]) +func.func @test_rsqrt_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> (tensor<13x21x3x!quant.uniform>) { + %0 = "tfl.rsqrt"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> + func.return %0 : tensor<13x21x3x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: test_sign +// CHECK-SAME: %[[VAL_0:.*]]: tensor<21x45xi32> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<-1> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<1> : tensor<1x1xi32>}> +// CHECK: %[[VAL_4:.*]] = "tosa.greater"(%[[VAL_0]], %[[VAL_1]]) +// CHECK: %[[VAL_5:.*]] = "tosa.greater"(%[[VAL_1]], %[[VAL_0]]) +// CHECK: %[[VAL_6:.*]] = "tosa.select"(%[[VAL_5]], %[[VAL_2]], %[[VAL_1]]) +// CHECK: %[[VAL_7:.*]] = "tosa.select"(%[[VAL_4]], %[[VAL_3]], %[[VAL_6]]) +func.func @test_sign(%arg0: tensor<21x45xi32>) -> tensor<21x45xi32> { + %0 = "tfl.sign"(%arg0) : (tensor<21x45xi32>) -> tensor<21x45xi32> + func.return %0 : tensor<21x45xi32> } // ----- @@ -690,12 +784,12 @@ func.func @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_sin // CHECK-SAME: -> tensor<10xf32> func.func @test_sin(%arg0: 
tensor<10xf32>) -> tensor<*xf32> { - // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1xf32>} - // CHECK-DAG: %[[TWO:.+]] = "tosa.const"() {value = dense<2.000000e+00> : tensor<1xf32>} - // CHECK-DAG: %[[RESULT_SCALE:.+]] = "tosa.const"() {value = dense<2.38418579E-7> : tensor<1xf32>} - // CHECK-DAG: %[[INT_MAX:.+]] = "tosa.const"() {value = dense<3.276700e+04> : tensor<1xf32>} - // CHECK-DAG: %[[IN_SCALE:.+]] = "tosa.const"() {value = dense<0.159154937> : tensor<1xf32>} - // CHECK-DAG: %[[TBLVAL:.+]] = "tosa.const"() {value = dense<{{.+}}> : tensor<513xi16>} + // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1xf32>}> + // CHECK-DAG: %[[TWO:.+]] = "tosa.const"() <{value = dense<2.000000e+00> : tensor<1xf32>}> + // CHECK-DAG: %[[RESULT_SCALE:.+]] = "tosa.const"() <{value = dense<2.38418579E-7> : tensor<1xf32>}> + // CHECK-DAG: %[[INT_MAX:.+]] = "tosa.const"() <{value = dense<3.276700e+04> : tensor<1xf32>}> + // CHECK-DAG: %[[IN_SCALE:.+]] = "tosa.const"() <{value = dense<0.159154937> : tensor<1xf32>}> + // CHECK-DAG: %[[TBLVAL:.+]] = "tosa.const"() <{value = dense<{{.+}}> : tensor<513xi16>}> // CHECK-DAG: %[[IN_SCALED:.+]] = "tosa.mul"(%arg0, %[[IN_SCALE]]) // CHECK-DAG: %[[FLOOR:.+]] = "tosa.floor"(%[[IN_SCALED]]) // CHECK-DAG: %[[SUB1:.+]] = "tosa.sub"(%[[IN_SCALED]], %[[FLOOR]]) @@ -717,13 +811,13 @@ func.func @test_sin(%arg0: tensor<10xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_cos // CHECK-SAME: -> tensor<10xf32> func.func @test_cos(%arg0: tensor<10xf32>) -> tensor<*xf32> { - // CHECK-DAG: %[[RESULT_SCALE:.+]] = "tosa.const"() {value = dense<2.38418579E-7> : tensor<1xf32>} - // CHECK-DAG: %[[INT_MAX:.+]] = "tosa.const"() {value = dense<3.276700e+04> : tensor<1xf32>} - // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1xf32>} - // CHECK-DAG: %[[TWO:.+]] = "tosa.const"() {value = dense<2.000000e+00> : tensor<1xf32>} - // CHECK-DAG: %[[IN_SCALE:.+]] = "tosa.const"() {value = dense<0.159154937> : tensor<1xf32>} - // CHECK-DAG: %[[HALF_PI:.+]] = "tosa.const"() {value = dense<1.57079637> : tensor<1xf32>} - // CHECK-DAG: %[[TBLVAL:.+]] = "tosa.const"() {value = dense<{{.+}}> : tensor<513xi16>} + // CHECK-DAG: %[[RESULT_SCALE:.+]] = "tosa.const"() <{value = dense<2.38418579E-7> : tensor<1xf32>}> + // CHECK-DAG: %[[INT_MAX:.+]] = "tosa.const"() <{value = dense<3.276700e+04> : tensor<1xf32>}> + // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1xf32>}> + // CHECK-DAG: %[[TWO:.+]] = "tosa.const"() <{value = dense<2.000000e+00> : tensor<1xf32>}> + // CHECK-DAG: %[[IN_SCALE:.+]] = "tosa.const"() <{value = dense<0.159154937> : tensor<1xf32>}> + // CHECK-DAG: %[[HALF_PI:.+]] = "tosa.const"() <{value = dense<1.57079637> : tensor<1xf32>}> + // CHECK-DAG: %[[TBLVAL:.+]] = "tosa.const"() <{value = dense<{{.+}}> : tensor<513xi16>}> // CHECK-DAG: %[[IN_TRANSLATE:.+]] = "tosa.add"(%arg0, %[[HALF_PI]]) // CHECK-DAG: %[[IN_SCALED:.+]] = "tosa.mul"(%[[IN_TRANSLATE]], %[[IN_SCALE]]) // CHECK-DAG: %[[FLOOR:.+]] = "tosa.floor"(%[[IN_SCALED]]) @@ -745,27 +839,27 @@ func.func @test_cos(%arg0: tensor<10xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_atan2 // CHECK-SAME: -> tensor<13x21x3xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() {value = dense<2.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() {value = 
dense<3.276700e+04> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() {value = dense<2.38418579E-7> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() {value = dense<1.57079637> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_7:.*]] = "tosa.const"() {value = dense<3.14159274> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_8:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<1x1x1xf32>} : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_9:.*]] = "tosa.const"() {value = dense<{{.+}}> : tensor<513xi16>} : () -> tensor<513xi16> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<2.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<3.276700e+04> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<2.38418579E-7> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<1.57079637> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_7:.*]] = "tosa.const"() <{value = dense<3.14159274> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_8:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_9:.*]] = "tosa.const"() <{value = dense<{{.+}}> : tensor<513xi16>}> : () -> tensor<513xi16> // CHECK: %[[VAL_10:.*]] = "tosa.abs"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_11:.*]] = "tosa.abs"(%arg1) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_12:.*]] = "tosa.minimum"(%[[VAL_10]], %[[VAL_11]]) : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_13:.*]] = "tosa.maximum"(%[[VAL_10]], %[[VAL_11]]) : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_14:.*]] = "tosa.reciprocal"(%[[VAL_13]]) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> -// CHECK: %[[VAL_15:.*]] = "tosa.mul"(%[[VAL_14]], %[[VAL_12]]) {shift = 0 : i32} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> -// CHECK: %[[VAL_16:.*]] = "tosa.mul"(%[[VAL_15]], %[[VAL_2]]) {shift = 0 : i32} : (tensor<13x21x3xf32>, tensor<1x1x1xf32>) -> tensor<13x21x3xf32> +// CHECK: %[[VAL_15:.*]] = "tosa.mul"(%[[VAL_14]], %[[VAL_12]]) <{shift = 0 : i32}> : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> +// CHECK: %[[VAL_16:.*]] = "tosa.mul"(%[[VAL_15]], %[[VAL_2]]) <{shift = 0 : i32}> : (tensor<13x21x3xf32>, tensor<1x1x1xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_17:.*]] = "tosa.sub"(%[[VAL_16]], %[[VAL_3]]) : (tensor<13x21x3xf32>, tensor<1x1x1xf32>) -> tensor<13x21x3xf32> -// CHECK: %[[VAL_18:.*]] = "tosa.mul"(%[[VAL_17]], %[[VAL_4]]) {shift = 0 : i32} : (tensor<13x21x3xf32>, tensor<1x1x1xf32>) -> tensor<13x21x3xf32> +// CHECK: %[[VAL_18:.*]] = "tosa.mul"(%[[VAL_17]], %[[VAL_4]]) <{shift = 0 : i32}> : (tensor<13x21x3xf32>, tensor<1x1x1xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_19:.*]] = "tosa.cast"(%[[VAL_18]]) : (tensor<13x21x3xf32>) -> tensor<13x21x3xi16> // CHECK: %[[VAL_20:.*]] = "tosa.table"(%[[VAL_19]], %[[VAL_9]]) : (tensor<13x21x3xi16>, tensor<513xi16>) -> tensor<13x21x3xi32> // CHECK: %[[VAL_21:.*]] = "tosa.cast"(%[[VAL_20]]) : (tensor<13x21x3xi32>) -> 
tensor<13x21x3xf32> -// CHECK: %[[VAL_22:.*]] = "tosa.mul"(%[[VAL_21]], %[[VAL_5]]) {shift = 0 : i32} : (tensor<13x21x3xf32>, tensor<1x1x1xf32>) -> tensor<13x21x3xf32> +// CHECK: %[[VAL_22:.*]] = "tosa.mul"(%[[VAL_21]], %[[VAL_5]]) <{shift = 0 : i32}> : (tensor<13x21x3xf32>, tensor<1x1x1xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_23:.*]] = "tosa.sub"(%[[VAL_6]], %[[VAL_22]]) : (tensor<1x1x1xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_24:.*]] = "tosa.greater"(%[[VAL_10]], %[[VAL_11]]) : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xi1> // CHECK: %[[VAL_25:.*]] = "tosa.select"(%[[VAL_24]], %[[VAL_23]], %[[VAL_22]]) : (tensor<13x21x3xi1>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> @@ -794,7 +888,7 @@ func.func @test_sigmoid(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_square -// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg0) {shift = 0 : i32} +// CHECK: %[[VAR0:.*]] = "tosa.mul"(%arg0, %arg0) <{shift = 0 : i32}> func.func @test_square(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %0 = "tfl.square"(%arg0) : (tensor<13x21x3xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -868,7 +962,7 @@ func.func @test_less_equal_dynamic(%arg0: tensor<13x1x3xf32>, %arg1: tensor<13x? // ----- // CHECK-LABEL: test_avg_pool2d -// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) {kernel = array, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{kernel = array, pad = array, stride = array}> func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -877,7 +971,7 @@ func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_avg_pool2d_dynamic -// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) {kernel = array, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{kernel = array, pad = array, stride = array}> func.func @test_avg_pool2d_dynamic(%arg0: tensor) -> tensor<*xf32> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -886,7 +980,7 @@ func.func @test_avg_pool2d_dynamic(%arg0: tensor) -> tensor<*xf32 // ----- // CHECK-LABEL: test_max_pool2d -// CHECK: %[[VAR0:.*]] = "tosa.max_pool2d"(%arg0) {kernel = array, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.max_pool2d"(%arg0) <{kernel = array, pad = array, stride = array}> func.func @test_max_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -895,7 +989,7 @@ func.func @test_max_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_max_pool2d_dynamic -// CHECK: %[[VAR0:.*]] = "tosa.max_pool2d"(%arg0) {kernel = array, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.max_pool2d"(%arg0) <{kernel = array, pad = array, stride = array}> func.func @test_max_pool2d_dynamic(%arg0: tensor) -> tensor<*xf32> { 
%0 = "tfl.max_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -904,7 +998,7 @@ func.func @test_max_pool2d_dynamic(%arg0: tensor) -> tensor<*xf32 // ----- // CHECK-LABEL: test_reshape -// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> func.func @test_reshape(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[1, 819]> : tensor<2xi32> %0 = "tfl.reshape"(%arg0, %cst) : (tensor<13x21x3xf32>, tensor<2xi32>) -> tensor<*xf32> @@ -914,7 +1008,7 @@ func.func @test_reshape(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_reshape_unknown -// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> // CHECK-SAME: -> tensor<9x91xf32> func.func @test_reshape_unknown(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[9, -1]> : tensor<2xi32> @@ -925,7 +1019,7 @@ func.func @test_reshape_unknown(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_reshape_dynamic -// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> // CHECK-SAME: -> tensor<3x?xf32> func.func @test_reshape_dynamic(%arg0: tensor<13x21x?xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[3, -1]> : tensor<2xi32> @@ -936,7 +1030,7 @@ func.func @test_reshape_dynamic(%arg0: tensor<13x21x?xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_transpose -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[2, 0, 1]> : tensor<3xi32>}> // CHECK: %[[VAR1:.*]] = "tosa.transpose"(%arg0, %[[VAR0]]) func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[2, 0, 1]> : tensor<3xi32> @@ -947,7 +1041,7 @@ func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_transpose -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[2, 0, 1]> : tensor<3xi32>}> // CHECK: %[[VAR1:.*]] = "tosa.transpose"(%arg0, %[[VAR0]]) func.func @test_transpose(%arg0: tensor<13x?x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[2, 0, 1]> : tensor<3xi32> @@ -958,7 +1052,7 @@ func.func @test_transpose(%arg0: tensor<13x?x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_slice -// CHECK: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} +// CHECK: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[6, 8, 0]> : tensor<3xi32> %cst_0 = arith.constant dense<[4, 11, 1]> : tensor<3xi32> @@ -969,10 +1063,10 @@ func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_strided_slice_simple -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) {size = array, start = array} -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] 
= "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array}> +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> func.func @test_strided_slice_simple(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, 21, 3]> : tensor<3xi32> @@ -984,10 +1078,10 @@ func.func @test_strided_slice_simple(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32 // ----- // CHECK-LABEL: test_strided_slice_simple_negative -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) {size = array, start = array} -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array}> +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> func.func @test_strided_slice_simple_negative(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, -3, 3]> : tensor<3xi32> @@ -999,8 +1093,8 @@ func.func @test_strided_slice_simple_negative(%arg0: tensor<13x21x3xf32>) -> ten // ----- // CHECK-LABEL: test_strided_slice_strideless -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> func.func @test_strided_slice_strideless(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, 21, 3]> : tensor<3xi32> @@ -1012,10 +1106,10 @@ func.func @test_strided_slice_strideless(%arg0: tensor<13x21x3xf32>) -> tensor<* // ----- // CHECK-LABEL: test_strided_slice_shrink -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) {size = array, start = array} -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array}> +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> func.func @test_strided_slice_shrink(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, 21, 3]> : tensor<3xi32> @@ -1027,8 +1121,8 @@ func.func @test_strided_slice_shrink(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32 // ----- // CHECK-LABEL: test_strided_slice_shrink_ignore_stride -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// 
CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> func.func @test_strided_slice_shrink_ignore_stride(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, 21, 3]> : tensor<3xi32> @@ -1041,8 +1135,8 @@ func.func @test_strided_slice_shrink_ignore_stride(%arg0: tensor<13x21x3xf32>) - // CHECK-LABEL: test_strided_slice_unstrided // CHECK-SAME: -> tensor<9x21x2xf32> -// CHECK: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK: %[[VAR1:.*]] = "tosa.reverse"(%[[VAR0]]) {axis = 2 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK: %[[VAR1:.*]] = "tosa.reverse"(%[[VAR0]]) <{axis = 2 : i64}> // CHECK: return %[[VAR1]] func.func @test_strided_slice_unstrided(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> @@ -1056,8 +1150,8 @@ func.func @test_strided_slice_unstrided(%arg0: tensor<13x21x3xf32>) -> tensor<*x // CHECK-LABEL: test_strided_slice_unstrided_shorter // CHECK: -> tensor<9x21x3xf32> -// CHECK: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK: %[[VAR1:.*]] = "tosa.reverse"(%[[VAR0]]) {axis = 1 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK: %[[VAR1:.*]] = "tosa.reverse"(%[[VAR0]]) <{axis = 1 : i64}> // CHECK: return %[[VAR1]] func.func @test_strided_slice_unstrided_shorter(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0]> : tensor<2xi32> @@ -1071,8 +1165,8 @@ func.func @test_strided_slice_unstrided_shorter(%arg0: tensor<13x21x3xf32>) -> t // CHECK-LABEL: test_strided_slice_dynamic_masked // CHECK-SAME: -> tensor<10x?x?xf32> -// CHECK: %[[VAR0:.*]] = "tosa.reverse"(%arg0) {axis = 1 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reverse"(%[[VAR0]]) {axis = 2 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.reverse"(%arg0) <{axis = 1 : i64}> +// CHECK: %[[VAR1:.*]] = "tosa.reverse"(%[[VAR0]]) <{axis = 2 : i64}> // CHECK: return %[[VAR1]] func.func @test_strided_slice_dynamic_masked(%arg0: tensor<10x?x?xf32>, %arg1: tensor<3xi32>) -> tensor<*xf32> { %cst_0 = arith.constant dense<[13, -1, 3]> : tensor<3xi32> @@ -1093,8 +1187,8 @@ func.func @test_strided_slice_dynamic_begin(%arg0: tensor<10x?x?xf32>) -> tensor %cst = arith.constant dense<[0, 2, 0]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, -1, 3]> : tensor<3xi32> %cst_1 = arith.constant dense<[1, -1, -1]> : tensor<3xi32> - // CHECK: %[[VAR0:.*]] = "tosa.reverse"(%arg0) {axis = 1 : i64} - // CHECK: %[[VAR1:.*]] = "tosa.reverse"(%[[VAR0]]) {axis = 2 : i64} + // CHECK: %[[VAR0:.*]] = "tosa.reverse"(%arg0) <{axis = 1 : i64}> + // CHECK: %[[VAR1:.*]] = "tosa.reverse"(%[[VAR0]]) <{axis = 2 : i64}> // CHECK: return %[[VAR1]] %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 7 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<10x?x?xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -1108,10 +1202,10 @@ func.func @test_strided_slice_dynamic_end(%arg0: tensor<10x?x?xf32>) -> tensor<* %end = arith.constant dense<[7, -1, 6]> : tensor<3xi32> %stride = arith.constant dense<[1, 2, -1]> : tensor<3xi32> - // CHECK: %[[SLICE1:.+]] = "tosa.slice"(%arg0) {size = array, start = array} - // CHECK: %[[RESHAPE1:.+]] = "tosa.reshape"(%[[SLICE1]]) {new_shape = array} - // CHECK: %[[SLICE2:.+]] = "tosa.slice"(%[[RESHAPE1]]) 
{size = array, start = array} - // CHECK: %[[RESHAPE2:.+]] = "tosa.reshape"(%[[SLICE2]]) {new_shape = array} + // CHECK: %[[SLICE1:.+]] = "tosa.slice"(%arg0) <{size = array, start = array}> + // CHECK: %[[RESHAPE1:.+]] = "tosa.reshape"(%[[SLICE1]]) <{new_shape = array}> + // CHECK: %[[SLICE2:.+]] = "tosa.slice"(%[[RESHAPE1]]) <{size = array, start = array}> + // CHECK: %[[RESHAPE2:.+]] = "tosa.reshape"(%[[SLICE2]]) <{new_shape = array}> %0 = "tfl.strided_slice"(%arg0, %begin, %end, %stride) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 2 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 4 : i32} : (tensor<10x?x?xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> // CHECK: return %[[RESHAPE2]] func.return %0 : tensor<*xf32> @@ -1120,7 +1214,7 @@ func.func @test_strided_slice_dynamic_end(%arg0: tensor<10x?x?xf32>) -> tensor<* // ----- // CHECK-LABEL: test_select -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%arg2) {new_shape = array} : (tensor<1xi1>) -> tensor<1x1x1xi1> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%arg2) <{new_shape = array}> : (tensor<1xi1>) -> tensor<1x1x1xi1> // CHECK: %[[VAR2:.*]] = "tosa.select"(%[[VAR1]], %arg0, %arg1) func.func @test_select(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<1xi1>) -> tensor<13x21x3xf32> { %0 = "tfl.select_v2"(%arg2, %arg0, %arg1) : (tensor<1xi1>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> @@ -1151,7 +1245,7 @@ func.func @test_addn(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %ar // ----- // CHECK-LABEL: test_concatv2 -// CHECK: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1, %arg2, %arg3) {axis = 0 : i64} +// CHECK: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1, %arg2, %arg3) <{axis = 0 : i64}> func.func @test_concatv2(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<13x21x3xf32>, %arg3: tensor<13x21x3xf32>) -> tensor<52x21x3xf32> { %0 = "tfl.concatenation"(%arg0, %arg1, %arg2, %arg3) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<52x21x3xf32> func.return %0 : tensor<52x21x3xf32> @@ -1160,8 +1254,8 @@ func.func @test_concatv2(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, // ----- // CHECK-LABEL: test_stack -// CHECK-DAG: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1, %arg2, %arg3) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1, %arg2, %arg3) <{axis = 0 : i64}> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> func.func @test_stack(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<13x21x3xf32>, %arg3: tensor<13x21x3xf32>) -> tensor<4x13x21x3xf32> { %0 = "tfl.pack"(%arg0, %arg1, %arg2, %arg3) {axis = 0 : i32, values_count = 4 : i32} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<4x13x21x3xf32> func.return %0 : tensor<4x13x21x3xf32> @@ -1170,9 +1264,9 @@ func.func @test_stack(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %a // ----- // CHECK-LABEL: test_stack_end -// CHECK-DAG: %[[PERM:.*]] = "tosa.const"() {value = dense<[1, 2, 3, 0]> : tensor<4xi32>} -// CHECK-DAG: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1) {axis = 0 : i64} -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) {new_shape = array} +// CHECK-DAG: %[[PERM:.*]] = "tosa.const"() <{value = dense<[1, 2, 3, 0]> : tensor<4xi32>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.concat"(%arg0, %arg1) 
<{axis = 0 : i64}> +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> // CHECK: %[[TRANSPOSE:.*]] = "tosa.transpose"(%[[VAR1]], %[[PERM]]) func.func @test_stack_end(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3x2xf32> { %0 = "tfl.pack"(%arg0, %arg1) {axis = 3 : i32, values_count = 2 : i32} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3x2xf32> @@ -1182,7 +1276,7 @@ func.func @test_stack_end(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32> // ----- // CHECK-LABEL: test_unstack -// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK: %[[VAR1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> func.func @test_unstack(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { %0 = "tfl.unpack"(%arg0) {axis = 0 : i32, num = 1 : i32} : (tensor<1x32x32x8xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -1191,8 +1285,8 @@ func.func @test_unstack(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_pad -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<{{\[\[}}1, 1], {{\[}}2, 2]]> : tensor<2x2xi32>} -// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{\[\[}}1, 1], {{\[}}2, 2]]> : tensor<2x2xi32>}> +// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor}> // CHECK: %[[VAR1:.*]] = "tosa.pad"(%arg0, %[[VAR0]], %[[PVAL]]) func.func @test_pad(%arg0: tensor<2x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[[1, 1], [2, 2]]> : tensor<2x2xi32> @@ -1206,10 +1300,10 @@ func.func @test_pad(%arg0: tensor<2x3xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_pad_v2 // CHECK-SAME: -> tensor<1x257x9x28xf32> func.func @test_pad_v2(%arg0: tensor<1x256x8x25xf32>) -> (tensor<*xf32>) { - // CHECK-DAG: %[[PADDING:.+]] = "tosa.const"() {value = dense<{{\[\[}}0, 0], [1, 0], [0, 1], [1, 2]]> : tensor<4x2xi32>} + // CHECK-DAG: %[[PADDING:.+]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 0], [0, 1], [1, 2]]> : tensor<4x2xi32>}> %0 = "tfl.pseudo_const"() {value = dense<[[0, 0], [1, 0], [0, 1], [1, 2]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> - // CHECK-DAG: %[[VAL:.+]] = "tosa.const"() {value = dense<-3.40282347E+38> : tensor} + // CHECK-DAG: %[[VAL:.+]] = "tosa.const"() <{value = dense<-3.40282347E+38> : tensor}> %1 = "tfl.pseudo_const"() {value = dense<-3.40282347E+38> : tensor} : () -> tensor // CHECK-DAG: %[[PAD:.+]] = "tosa.pad"(%arg0, %[[PADDING]], %[[VAL]]) : (tensor<1x256x8x25xf32>, tensor<4x2xi32>, tensor) -> tensor<1x257x9x28xf32> @@ -1222,7 +1316,7 @@ func.func @test_pad_v2(%arg0: tensor<1x256x8x25xf32>) -> (tensor<*xf32>) { // ----- // CHECK-LABEL: test_expand_dims -// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK: %[[VAR0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> func.func @test_expand_dims(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[1, 13, 21, 3]> : tensor<4xi32> %0 = "tfl.reshape"(%arg0, %cst) : (tensor<13x21x3xf32>, tensor<4xi32>) -> tensor<*xf32> @@ -1232,7 +1326,7 @@ func.func @test_expand_dims(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_shape -// CHECK: %[[VAR0:.*]] = "tosa.const"() {value = dense<[13, 21, 3]> : tensor<3xi32>} +// CHECK: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[13, 21, 3]> : tensor<3xi32>}> func.func @test_shape() -> tensor<3xi32> { %cst = arith.constant dense<[13, 21, 3]> : tensor<3xi32> func.return %cst : tensor<3xi32> @@ 
-1241,7 +1335,7 @@ func.func @test_shape() -> tensor<3xi32> { // ----- // CHECK-LABEL: test_rank -// CHECK: %[[VAR0:.*]] = "tosa.const"() {value = dense<3> : tensor} +// CHECK: %[[VAR0:.*]] = "tosa.const"() <{value = dense<3> : tensor}> func.func @test_rank() -> tensor { %cst = arith.constant dense<3> : tensor func.return %cst : tensor @@ -1250,8 +1344,8 @@ func.func @test_rank() -> tensor { // ----- // CHECK-LABEL: test_elu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x1xf32>}> // CHECK-DAG: %[[VAR2:.*]] = "tosa.exp"(%arg0) // CHECK-DAG: %[[VAR4:.*]] = "tosa.sub"(%[[VAR2]], %[[VAR0]]) // CHECK-DAG: %[[VAR6:.*]] = "tosa.greater_equal"(%arg0, %[[VAR1]]) @@ -1264,10 +1358,12 @@ func.func @test_elu(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_softmax -// CHECK-DAG: %[[VAR0:.*]] = "tosa.exp"(%arg0) -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%[[VAR0]]) {axis = 2 : i64} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.reciprocal"(%[[VAR1]]) -// CHECK: %[[VAR3:.*]] = "tosa.mul"(%[[VAR0]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_max"(%arg0) +// CHECK-DAG: %[[VAR1:.*]] = "tosa.sub"(%arg0, %[[VAR0]]) +// CHECK-DAG: %[[VAR2:.*]] = "tosa.exp"(%[[VAR1]]) +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reduce_sum"(%[[VAR2]]) <{axis = 2 : i64}> +// CHECK-DAG: %[[VAR4:.*]] = "tosa.reciprocal"(%[[VAR3]]) +// CHECK: %[[VAR5:.*]] = "tosa.mul"(%[[VAR2]], %[[VAR4]]) <{shift = 0 : i32}> func.func @test_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> func.return %0 : tensor<13x21x3xf32> @@ -1277,13 +1373,13 @@ func.func @test_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // CHECK-LABEL: test_l2normalization func.func @test_l2normalization(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { - // CHECK-DAG: %[[MIN:.+]] = "tosa.const"() {value = dense<1.08420217E-19> : tensor<1x1xf32>} - // CHECK-DAG: %[[SQR:.+]] = "tosa.mul"(%arg0, %arg0) {shift = 0 : i32} - // CHECK-DAG: %[[SUM:.+]] = "tosa.reduce_sum"(%[[SQR]]) {axis = 1 : i64} + // CHECK-DAG: %[[MIN:.+]] = "tosa.const"() <{value = dense<1.08420217E-19> : tensor<1x1xf32>}> + // CHECK-DAG: %[[SQR:.+]] = "tosa.mul"(%arg0, %arg0) <{shift = 0 : i32}> + // CHECK-DAG: %[[SUM:.+]] = "tosa.reduce_sum"(%[[SQR]]) <{axis = 1 : i64}> // CHECK-DAG: %[[MAX:.+]] = "tosa.maximum"(%[[SUM]], %[[MIN]]) // CHECK-DAG: %[[RSQRT:.+]] = "tosa.rsqrt"(%[[MAX]]) // CHECK-DAG: %[[MUL:.+]] = "tosa.mul"(%[[RSQRT]], %arg0) - // CHECK: %[[CLAMP:.+]] = "tosa.clamp"(%[[MUL]]) {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} + // CHECK: %[[CLAMP:.+]] = "tosa.clamp"(%[[MUL]]) <{max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64}> %0 = "tfl.l2_normalization"(%arg0) {fused_activation_function = "RELU"} : (tensor<16x16xf32>) -> tensor<16x16xf32> func.return %0 : tensor<16x16xf32> } @@ -1292,9 +1388,9 @@ func.func @test_l2normalization(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) // CHECK-LABEL: test_log_softmax // CHECK-DAG: %[[VAR0:.*]] = "tosa.exp"(%arg0) -// CHECK-DAG: %[[VAR1:.*]] = 
"tosa.reduce_sum"(%[[VAR0]]) {axis = 2 : i64} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%[[VAR0]]) <{axis = 2 : i64}> // CHECK-DAG: %[[VAR2:.*]] = "tosa.reciprocal"(%[[VAR1]]) -// CHECK-DAG: %[[VAR3:.*]] = "tosa.mul"(%[[VAR0]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.mul"(%[[VAR0]], %[[VAR2]]) <{shift = 0 : i32}> // CHECK: %[[VAR4:.*]] = "tosa.log"(%[[VAR3]]) func.func @test_log_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = "tfl.log_softmax"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> @@ -1304,8 +1400,8 @@ func.func @test_log_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_matmul -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<28xf32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<28xf32>}> // CHECK: %[[VAR2:.*]] = "tosa.transpose"(%arg1, %[[VAR0]]) // CHECK: %[[VAR3:.*]] = "tosa.fully_connected"(%arg0, %[[VAR2]], %[[VAR1]]) func.func @test_matmul(%arg0: tensor<14x19xf32>, %arg1: tensor<19x28xf32>) -> tensor<*xf32> { @@ -1377,10 +1473,10 @@ func.func @test_batch_matmul(%arg0: tensor<1x16x128xf32>, %arg1: tensor<1x128x32 // CHECK-LABEL: @test_batch_matmul_4d func.func @test_batch_matmul_4d(%arg0: tensor<4x5x16x128xf32>, %arg1: tensor<4x5x128x32xf32>) -> (tensor<4x5x16x32xf32> ) { - // CHECK: %[[R0:.*]] = "tosa.reshape"(%arg0) {new_shape = array} - // CHECK: %[[R1:.*]] = "tosa.reshape"(%arg1) {new_shape = array} + // CHECK: %[[R0:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> + // CHECK: %[[R1:.*]] = "tosa.reshape"(%arg1) <{new_shape = array}> // CHECK: %[[MM:.*]] = "tosa.matmul"(%[[R0]], %[[R1]]) - // CHECK: "tosa.reshape"(%[[MM]]) {new_shape = array} + // CHECK: "tosa.reshape"(%[[MM]]) <{new_shape = array}> %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<4x5x16x128xf32>, tensor<4x5x128x32xf32>) -> tensor<4x5x16x32xf32> func.return %0 : tensor<4x5x16x32xf32> } @@ -1389,7 +1485,7 @@ func.func @test_batch_matmul_4d(%arg0: tensor<4x5x16x128xf32>, %arg1: tensor<4x5 // CHECK-LABEL: @test_batch_matmul_transpose func.func @test_batch_matmul_transpose(%arg0: tensor<1x16x128xf32>, %arg1: tensor<1x128x32xf32>) -> (tensor<1x32x16xf32> ) { - // CHECK-DAG: %[[PERM:.+]] = "tosa.const"() {value = dense<[0, 2, 1]> : tensor<3xi32>} + // CHECK-DAG: %[[PERM:.+]] = "tosa.const"() <{value = dense<[0, 2, 1]> : tensor<3xi32>}> // CHECK-DAG: %[[TP0:.+]] = "tosa.transpose"(%arg0, %[[PERM]]) // CHECK-DAG: %[[TP1:.+]] = "tosa.transpose"(%arg1, %[[PERM]]) // CHECK: "tosa.matmul"(%[[TP1]], %[[TP0]]) @@ -1400,7 +1496,7 @@ func.func @test_batch_matmul_transpose(%arg0: tensor<1x16x128xf32>, %arg1: tenso // ----- // CHECK-LABEL: test_add_scalar -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>}> // CHECK: %[[VAR2:.*]] = "tosa.add"(%arg0, %[[VAR0]]) func.func @test_add_scalar(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<1.000000e+00> : tensor @@ -1411,8 +1507,8 @@ func.func @test_add_scalar(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_add_1d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_sum"(%arg1) {axis = 0 : i64} -// CHECK-DAG: %[[VAR1:.*]] = 
"tosa.reduce_sum"(%[[VAR0]]) {axis = 1 : i64} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.reduce_sum"(%arg1) <{axis = 0 : i64}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reduce_sum"(%[[VAR0]]) <{axis = 1 : i64}> // CHECK: %[[VAR2:.*]] = "tosa.add"(%arg0, %[[VAR1]]) func.func @test_add_1d(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[0, 1]> : tensor<2xi32> @@ -1429,7 +1525,7 @@ func.func @test_fused_activation_relun_clamp( %arg1: tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { %cst = arith.constant dense<1.000000e+00> : tensor - // CHECK: "tosa.clamp"(%{{.+}}) {max_fp = 0.000000e+00 : f32, max_int = -67 : i64, min_fp = 0.000000e+00 : f32, min_int = -127 : i64} + // CHECK: "tosa.clamp"(%{{.+}}) <{max_fp = 0.000000e+00 : f32, max_int = -67 : i64, min_fp = 0.000000e+00 : f32, min_int = -127 : i64}> %0 = "tfl.add"(%arg0, %arg0) {fused_activation_function = "RELU6"} : (tensor<10x!quant.uniform>, tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> func.return %0 : tensor<10x!quant.uniform> } @@ -1442,7 +1538,7 @@ func.func @test_fused_activation_relun_noclamp( %arg1: tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { %cst = arith.constant dense<1.000000e+00> : tensor - // CHECK: "tosa.clamp"(%{{.+}}) {max_fp = 0.000000e+00 : f32, max_int = 127 : i64, min_fp = 0.000000e+00 : f32, min_int = -128 : i64} + // CHECK: "tosa.clamp"(%{{.+}}) <{max_fp = 0.000000e+00 : f32, max_int = 127 : i64, min_fp = 0.000000e+00 : f32, min_int = -128 : i64}> %0 = "tfl.add"(%arg0, %arg0) {fused_activation_function = "RELU6"} : (tensor<10x!quant.uniform>, tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> func.return %0 : tensor<10x!quant.uniform> } @@ -1454,7 +1550,7 @@ func.func @test_fused_activation_relun1to1_noclamp( %arg0: tensor<10x!quant.uniform>, %arg1: tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { %cst = arith.constant dense<1.000000e+00> : tensor - // CHECK: "tosa.clamp"(%{{.}}) {max_fp = 0.000000e+00 : f32, max_int = 127 : i64, min_fp = 0.000000e+00 : f32, min_int = -128 : i64} + // CHECK: "tosa.clamp"(%{{.}}) <{max_fp = 0.000000e+00 : f32, max_int = 127 : i64, min_fp = 0.000000e+00 : f32, min_int = -128 : i64}> %0 = "tfl.add"(%arg0, %arg0) {fused_activation_function = "RELU_N1_TO_1"} : (tensor<10x!quant.uniform>, tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> func.return %0 : tensor<10x!quant.uniform> } @@ -1466,7 +1562,7 @@ func.func @test_fused_activation_relun1to1_clamp( %arg0: tensor<10x!quant.uniform>, %arg1: tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { %cst = arith.constant dense<1.000000e+00> : tensor - // CHECK: "tosa.clamp"(%{{.}}) {max_fp = 0.000000e+00 : f32, max_int = 90 : i64, min_fp = 0.000000e+00 : f32, min_int = -110 : i64} + // CHECK: "tosa.clamp"(%{{.}}) <{max_fp = 0.000000e+00 : f32, max_int = 90 : i64, min_fp = 0.000000e+00 : f32, min_int = -110 : i64}> %0 = "tfl.add"(%arg0, %arg0) {fused_activation_function = "RELU_N1_TO_1"} : (tensor<10x!quant.uniform>, tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> func.return %0 : tensor<10x!quant.uniform> } @@ -1474,9 +1570,9 @@ func.func @test_fused_activation_relun1to1_clamp( // ----- // CHECK-LABEL: test_split -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK: %[[VAR2:.*]] = "tosa.slice"(%arg0) {size = array, start = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: 
%[[VAR1:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK: %[[VAR2:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> func.func @test_split(%arg0: tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, tensor<13x7x3xf32>, tensor<13x7x3xf32>) { %cst_0 = arith.constant dense<1> : tensor %0:3 = "tfl.split"(%cst_0, %arg0) {num_splits = 3 : i32} : (tensor, tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, tensor<13x7x3xf32>, tensor<13x7x3xf32>) @@ -1488,13 +1584,13 @@ func.func @test_split(%arg0: tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, tensor // CHECK-LABEL: test_split_dynamic func.func @test_split_dynamic(%arg0: tensor<13x?x3xf32>) -> (tensor<13x?x3xf32>, tensor<13x?x3xf32>, tensor<13x?x3xf32>) { %cst_0 = arith.constant dense<1> : tensor - // CHECK-DAG: %[[VAR0:.+]] = "tosa.reshape"(%arg0) {new_shape = array} - // CHECK-DAG: %[[VAR1:.+]] = "tosa.slice"(%[[VAR0]]) {size = array, start = array} - // CHECK-DAG: %[[VAR2:.+]] = "tosa.slice"(%[[VAR0]]) {size = array, start = array} - // CHECK-DAG: %[[VAR3:.+]] = "tosa.slice"(%[[VAR0]]) {size = array, start = array} - // CHECK-DAG: %[[VAR4:.+]] = "tosa.reshape"(%[[VAR1]]) {new_shape = array} - // CHECK-DAG: %[[VAR5:.+]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} - // CHECK-DAG: %[[VAR6:.+]] = "tosa.reshape"(%[[VAR3]]) {new_shape = array} + // CHECK-DAG: %[[VAR0:.+]] = "tosa.reshape"(%arg0) <{new_shape = array}> + // CHECK-DAG: %[[VAR1:.+]] = "tosa.slice"(%[[VAR0]]) <{size = array, start = array}> + // CHECK-DAG: %[[VAR2:.+]] = "tosa.slice"(%[[VAR0]]) <{size = array, start = array}> + // CHECK-DAG: %[[VAR3:.+]] = "tosa.slice"(%[[VAR0]]) <{size = array, start = array}> + // CHECK-DAG: %[[VAR4:.+]] = "tosa.reshape"(%[[VAR1]]) <{new_shape = array}> + // CHECK-DAG: %[[VAR5:.+]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> + // CHECK-DAG: %[[VAR6:.+]] = "tosa.reshape"(%[[VAR3]]) <{new_shape = array}> // CHECK: return %[[VAR4]], %[[VAR5]], %[[VAR6]] %0:3 = "tfl.split"(%cst_0, %arg0) {num_splits = 3 : i32} : (tensor, tensor<13x?x3xf32>) -> (tensor<13x?x3xf32>, tensor<13x?x3xf32>, tensor<13x?x3xf32>) func.return %0#0, %0#1, %0#2 : tensor<13x?x3xf32>, tensor<13x?x3xf32>, tensor<13x?x3xf32> @@ -1503,9 +1599,9 @@ func.func @test_split_dynamic(%arg0: tensor<13x?x3xf32>) -> (tensor<13x?x3xf32>, // ----- // CHECK-LABEL: test_split_neg -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK: %[[VAR2:.*]] = "tosa.slice"(%arg0) {size = array, start = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK: %[[VAR2:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> func.func @test_split_neg(%arg0: tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, tensor<13x7x3xf32>, tensor<13x7x3xf32>) { %cst_0 = arith.constant dense<-2> : tensor %0:3 = "tfl.split"(%cst_0, %arg0) {num_splits = 3 : i32} : (tensor, tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, tensor<13x7x3xf32>, tensor<13x7x3xf32>) @@ -1515,9 +1611,9 @@ func.func @test_split_neg(%arg0: tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, te // ----- // CHECK-LABEL: test_split_axis_0 -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK: %[[VAR2:.*]] = "tosa.slice"(%arg0) {size = array, start = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) 
<{size = array, start = array}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK: %[[VAR2:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> func.func @test_split_axis_0(%arg0: tensor<21x13x3xf32>) -> (tensor<7x13x3xf32>, tensor<7x13x3xf32>, tensor<7x13x3xf32>) { %cst_0 = arith.constant dense<0> : tensor %0:3 = "tfl.split"(%cst_0, %arg0) {num_splits = 3 : i32} : (tensor, tensor<21x13x3xf32>) -> (tensor<7x13x3xf32>, tensor<7x13x3xf32>, tensor<7x13x3xf32>) @@ -1527,8 +1623,8 @@ func.func @test_split_axis_0(%arg0: tensor<21x13x3xf32>) -> (tensor<7x13x3xf32>, // ----- // CHECK-LABEL: test_split_v_neg_axis -// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) {size = array, start = array} -// CHECK: %[[VAR1:.*]] = "tosa.slice"(%arg0) {size = array, start = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> +// CHECK: %[[VAR1:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> func.func @test_split_v_neg_axis(%arg0: tensor<2x3x3x8xf32>) -> (tensor<2x3x3x3xf32>, tensor<2x3x3x5xf32>) { %split_size = arith.constant dense<[3, 5]> : tensor<2xi32> %axis = arith.constant dense<-1> : tensor @@ -1549,13 +1645,13 @@ func.func @test_tile(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_space_to_batch -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<{{\[}}[0, 0], [0, 1], [0, 0]]> : tensor<3x2xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<[2, 0, 1, 3]> : tensor<4xi32>} -// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [0, 1], [0, 0]]> : tensor<3x2xi32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<[2, 0, 1, 3]> : tensor<4xi32>}> +// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor}> // CHECK-DAG: %[[VAR2:.*]] = "tosa.pad"(%arg0, %[[VAR0]], %[[PVAL]]) -// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> // CHECK-DAG: %[[VAR4:.*]] = "tosa.transpose"(%[[VAR3]], %[[VAR1]]) -// CHECK: %[[VAR5:.*]] = "tosa.reshape"(%[[VAR4]]) {new_shape = array} +// CHECK: %[[VAR5:.*]] = "tosa.reshape"(%[[VAR4]]) <{new_shape = array}> func.func @test_space_to_batch(%arg0: tensor<13x21x3xf32>) -> tensor<26x11x3xf32> { %cst = arith.constant dense<2> : tensor<1xi32> %cst_0 = arith.constant dense<[[0, 1]]> : tensor<1x2xi32> @@ -1566,13 +1662,13 @@ func.func @test_space_to_batch(%arg0: tensor<13x21x3xf32>) -> tensor<26x11x3xf32 // ----- // CHECK-LABEL: test_space_to_batch_dyn -// CHECK-DAG: %[[C0:.+]] = "tosa.const"() {value = dense<0.000000e+00> : tensor} -// CHECK-DAG: %[[C1:.+]] = "tosa.const"() {value = dense<{{\[\[}}0, 0], [0, 2], [0, 0], [0, 0]]> : tensor<4x2xi32>} -// CHECK-DAG: %[[C2:.+]] = "tosa.const"() {value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>} +// CHECK-DAG: %[[C0:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor}> +// CHECK-DAG: %[[C1:.+]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [0, 2], [0, 0], [0, 0]]> : tensor<4x2xi32>}> +// CHECK-DAG: %[[C2:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}> // CHECK-DAG: %[[PAD:.+]] = "tosa.pad"(%arg0, %[[C1]], %[[C0]]) : (tensor, tensor<4x2xi32>, tensor) -> tensor -// CHECK-DAG: %[[R0:.+]] = "tosa.reshape"(%[[PAD]]) {new_shape = array} +// CHECK-DAG: %[[R0:.+]] = "tosa.reshape"(%[[PAD]]) <{new_shape = array}> // CHECK-DAG: %[[T:.+]] 
= "tosa.transpose"(%[[R0]], %[[C2]]) -// CHECK-DAG: %[[R1:.+]] = "tosa.reshape"(%[[T]]) {new_shape = array} +// CHECK-DAG: %[[R1:.+]] = "tosa.reshape"(%[[T]]) <{new_shape = array}> // CHECK: return %[[R1]] : tensor func.func @test_space_to_batch_dyn(%arg0 : tensor) -> (tensor) { %0 = "tfl.pseudo_const"() {value = dense<[3, 1]> : tensor<2xi32>} : () -> tensor<2xi32> @@ -1584,12 +1680,12 @@ func.func @test_space_to_batch_dyn(%arg0 : tensor) -> (tensor : tensor<4xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<[2, 3, 0, 4, 1, 5]> : tensor<6xi32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[3, 1, 2, 0]> : tensor<4xi32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<[2, 3, 0, 4, 1, 5]> : tensor<6xi32>}> // CHECK-DAG: %[[VAR2:.*]] = "tosa.transpose"(%arg0, %[[VAR0]]) -// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> // CHECK-DAG: %[[VAR4:.*]] = "tosa.transpose"(%[[VAR3]], %[[VAR1]]) -// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%[[VAR4]]) {new_shape = array} +// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%[[VAR4]]) <{new_shape = array}> // CHECK: return %[[VAR5:.*]] func.func @test_batch_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<2x64x64x1xf32> { %cst = arith.constant dense<2> : tensor<2xi32> @@ -1603,11 +1699,11 @@ func.func @test_batch_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<2x64x64x1 // ----- // CHECK-LABEL: @test_batch_to_space_dyn -// CHECK-DAG: %[[C0:.+]] = "tosa.const"() {value = dense<[2, 3, 0, 4, 1, 5]> : tensor<6xi32>} -// CHECK-DAG: %[[R0:.+]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK-DAG: %[[C0:.+]] = "tosa.const"() <{value = dense<[2, 3, 0, 4, 1, 5]> : tensor<6xi32>}> +// CHECK-DAG: %[[R0:.+]] = "tosa.reshape"(%arg0) <{new_shape = array}> // CHECK-DAG: %[[T:.+]] = "tosa.transpose"(%[[R0]], %[[C0]]) -// CHECK-DAG: %[[R1:.+]] = "tosa.reshape"(%[[T]]) {new_shape = array} -// CHECK-DAG: %[[SLICE:.+]] = "tosa.slice"(%[[R1]]) {size = array, start = array} +// CHECK-DAG: %[[R1:.+]] = "tosa.reshape"(%[[T]]) <{new_shape = array}> +// CHECK-DAG: %[[SLICE:.+]] = "tosa.slice"(%[[R1]]) <{size = array, start = array}> // CHECK: return %[[SLICE]] func.func @test_batch_to_space_dyn(%arg0 : tensor) -> (tensor) { %0 = "tfl.pseudo_const"() {value = dense<[3, 1]> : tensor<2xi32>} : () -> tensor<2xi32> @@ -1619,10 +1715,10 @@ func.func @test_batch_to_space_dyn(%arg0 : tensor) -> (tensor : tensor<6xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> // CHECK-DAG: %[[VAR2:.*]] = "tosa.transpose"(%[[VAR1]], %[[VAR0]]) -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> func.func @test_space_to_depth(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x16x16x32xf32> { %0 = "tfl.space_to_depth"(%arg0) {block_size = 2 : i32} : (tensor<1x32x32x8xf32>) -> tensor<1x16x16x32xf32> func.return %0 : tensor<1x16x16x32xf32> @@ -1631,10 +1727,10 @@ func.func @test_space_to_depth(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x16x16x3 // ----- // CHECK-LABEL: test_depth_to_space -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK-DAG: %[[VAR0:.*]] 
= "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> // CHECK-DAG: %[[VAR2:.*]] = "tosa.transpose"(%[[VAR1]], %[[VAR0]]) -// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> func.func @test_depth_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x64x64x2xf32> { %0 = "tfl.depth_to_space"(%arg0) {block_size = 2 : i32} : (tensor<1x32x32x8xf32>) -> tensor<1x64x64x2xf32> func.return %0 : tensor<1x64x64x2xf32> @@ -1643,12 +1739,12 @@ func.func @test_depth_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x64x64x2 // ----- // CHECK-LABEL: @test_bucketize -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() {value = dense<{{\[\[\[}}0.000000e+00, 3.000000e+00, 8.000000e+00, 1.100000e+01]]]> : tensor<1x1x4xf32>} -// CHECK: %[[VAL_1:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<{{\[\[\[}}0.000000e+00, 3.000000e+00, 8.000000e+00, 1.100000e+01]]]> : tensor<1x1x4xf32>}> +// CHECK: %[[VAL_1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> // CHECK: %[[VAL_2:.*]] = "tosa.greater_equal"(%[[VAL_1]], %[[VAL_0]]) // CHECK: %[[VAL_3:.*]] = "tosa.cast"(%[[VAL_2]]) : (tensor<2x5x4xi1>) -> tensor<2x5x4xi32> -// CHECK: %[[VAL_4:.*]] = "tosa.reduce_sum"(%[[VAL_3]]) {axis = 2 : i64} -// CHECK: %[[VAL_5:.*]] = "tosa.reshape"(%[[VAL_4]]) {new_shape = array} +// CHECK: %[[VAL_4:.*]] = "tosa.reduce_sum"(%[[VAL_3]]) <{axis = 2 : i64}> +// CHECK: %[[VAL_5:.*]] = "tosa.reshape"(%[[VAL_4]]) <{new_shape = array}> func.func @test_bucketize(%arg0: tensor<2x5xf32>) -> tensor<2x5xi32> { %0 = "tfl.bucketize"(%arg0) {boundaries = [0.000000e+00 : f32, 3.000000e+00 : f32, 8.000000e+00 : f32, 1.100000e+01 : f32]} : (tensor<2x5xf32>) -> tensor<2x5xi32> func.return %0 : tensor<2x5xi32> @@ -1657,14 +1753,14 @@ func.func @test_bucketize(%arg0: tensor<2x5xf32>) -> tensor<2x5xi32> { // ----- // CHECK-LABEL: @test_bucketize_cast_boundaries -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() {value = dense<{{\[}}0.000000e+00, 3.000000e+00, 8.000000e+00, 1.100000e+01]> : tensor<4xf32>} +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<{{\[}}0.000000e+00, 3.000000e+00, 8.000000e+00, 1.100000e+01]> : tensor<4xf32>}> // CHECK: %[[VAL_1:.*]] = "tosa.cast"(%[[VAL_0]]) : (tensor<4xf32>) -> tensor<4xi32> -// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%arg0) {new_shape = array} -// CHECK: %[[VAL_3:.*]] = "tosa.reshape"(%[[VAL_1]]) {new_shape = array} +// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> +// CHECK: %[[VAL_3:.*]] = "tosa.reshape"(%[[VAL_1]]) <{new_shape = array}> // CHECK: %[[VAL_4:.*]] = "tosa.greater_equal"(%[[VAL_2]], %[[VAL_3]]) // CHECK: %[[VAL_5:.*]] = "tosa.cast"(%[[VAL_4]]) : (tensor<2x5x4xi1>) -> tensor<2x5x4xi32> -// CHECK: %[[VAL_6:.*]] = "tosa.reduce_sum"(%[[VAL_5]]) {axis = 2 : i64} -// CHECK: %[[VAL_7:.*]] = "tosa.reshape"(%[[VAL_6]]) {new_shape = array} +// CHECK: %[[VAL_6:.*]] = "tosa.reduce_sum"(%[[VAL_5]]) <{axis = 2 : i64}> +// CHECK: %[[VAL_7:.*]] = "tosa.reshape"(%[[VAL_6]]) <{new_shape = array}> func.func @test_bucketize_cast_boundaries(%arg0: tensor<2x5xi32>) -> tensor<2x5xi32> { %0 = "tfl.bucketize"(%arg0) {boundaries = [0.000000e+00 : f32, 3.000000e+00 : f32, 8.000000e+00 : f32, 1.100000e+01 : f32]} : (tensor<2x5xi32>) -> tensor<2x5xi32> func.return %0 : tensor<2x5xi32> @@ -1674,13 +1770,13 @@ func.func @test_bucketize_cast_boundaries(%arg0: 
tensor<2x5xi32>) -> tensor<2x5x // CHECK-LABEL: @test_one_hot // CHECK-SAME: %[[ARG0:.*]]: tensor<4x4xi32>, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor -// CHECK-DAG: %[[RESHAPE:.*]] = "tosa.reshape"(%[[ARG1]]) {new_shape = array} -// CHECK-DAG: %[[TILE:.*]] = "tosa.tile"(%[[RESHAPE]]) {multiples = array} -// CHECK-DAG: %[[RESHAPE_0:.*]] = "tosa.reshape"(%[[ARG2]]) {new_shape = array} -// CHECK-DAG: %[[TILE_0:.*]] = "tosa.tile"(%[[RESHAPE_0]]) {multiples = array} -// CHECK-DAG: %[[RESHAPE_1:.*]] = "tosa.reshape"(%[[ARG0]]) {new_shape = array} +// CHECK-DAG: %[[RESHAPE:.*]] = "tosa.reshape"(%[[ARG1]]) <{new_shape = array}> +// CHECK-DAG: %[[TILE:.*]] = "tosa.tile"(%[[RESHAPE]]) <{multiples = array}> +// CHECK-DAG: %[[RESHAPE_0:.*]] = "tosa.reshape"(%[[ARG2]]) <{new_shape = array}> +// CHECK-DAG: %[[TILE_0:.*]] = "tosa.tile"(%[[RESHAPE_0]]) <{multiples = array}> +// CHECK-DAG: %[[RESHAPE_1:.*]] = "tosa.reshape"(%[[ARG0]]) <{new_shape = array}> // CHECK-DAG: %[[SCATTER:.*]] = "tosa.scatter"(%[[TILE_0]], %[[RESHAPE_1]], %[[TILE]]) -// CHECK-DAG: %[[RESHAPE_2:.*]] = "tosa.reshape"(%[[SCATTER]]) {new_shape = array} +// CHECK-DAG: %[[RESHAPE_2:.*]] = "tosa.reshape"(%[[SCATTER]]) <{new_shape = array}> // CHECK: return %[[RESHAPE_2]] func.func @test_one_hot(%arg0: tensor<4x4xi32>, %arg1: tensor, %arg2: tensor) -> tensor<4x4x2xf32> { %0 = arith.constant dense<2> : tensor @@ -1691,15 +1787,12 @@ func.func @test_one_hot(%arg0: tensor<4x4xi32>, %arg1: tensor, %arg2: tenso // ----- // CHECK-LABEL: test_fakequant_with_min_max_args -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<16383.75> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() {value = dense<6.10360876E-5> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR4:.*]] = "tosa.mul"(%arg0, %[[VAR0]]) {shift = 0 : i32} -// CHECK-DAG: %[[VAR5:.*]] = "tosa.add"(%[[VAR4]], %[[VAR1]]) -// CHECK-DAG: %[[VAR7:.*]] = "tosa.cast"(%[[VAR5]]) -// CHECK-DAG: %[[VAR8:.*]] = "tosa.cast"(%[[VAR7]]) -// CHECK-DAG: %[[VAR10:.*]] = "tosa.sub"(%[[VAR8]], %[[VAR1]]) -// CHECK-DAG: %[[VAR12:.*]] = "tosa.mul"(%[[VAR10]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<16383.75> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<6.10360876E-5> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.mul"(%arg0, %[[VAR0]]) <{shift = 0 : i32}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.cast"(%[[VAR2]]) +// CHECK-DAG: %[[VAR4:.*]] = "tosa.cast"(%[[VAR3]]) +// CHECK-DAG: %[[VAR5:.*]] = "tosa.mul"(%[[VAR4]], %[[VAR1]]) <{shift = 0 : i32}> func.func @test_fakequant_with_min_max_args(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = "tfl.quantize"(%arg0) {qtype = tensor<13x21x3x!quant.uniform>} : (tensor<13x21x3xf32>) -> tensor<*x!quant.uniform> %1 = "tfl.dequantize"(%0) : (tensor<*x!quant.uniform>) -> tensor<13x21x3xf32> @@ -1720,7 +1813,7 @@ func.func @test_dequantize_float(%arg0: tensor<10xf16>) -> tensor<*xf32> { // CHECK-LABEL: @test_dequantize_quant_uniform func.func @test_dequantize_quant_uniform(%arg0: tensor<4x!quant.uniform>) -> tensor<*xf32> { - // CHECK-DAG: %[[VAL0:.+]] = "tosa.const"() {value = dense<-1.000000e+00> : tensor<1xf32>} + // CHECK-DAG: %[[VAL0:.+]] = "tosa.const"() <{value = dense<-1.000000e+00> : tensor<1xf32>}> // CHECK-DAG: %[[VAL1:.+]] = "tosa.cast"(%arg0) // CHECK-DAG: %[[VAL2:.+]] = "tosa.sub"(%[[VAL1]], %[[VAL0]]) %0 = "tfl.dequantize"(%arg0) : 
(tensor<4x!quant.uniform>) -> tensor<*xf32> @@ -1730,11 +1823,11 @@ func.func @test_dequantize_quant_uniform(%arg0: tensor<4x!quant.uniform>) -> tensor<*xf32> { - // CHECK-DAG: %[[VAL0:.+]] = "tosa.const"() {value = dense<{{\[}}[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]]> : tensor<1x4xf32>} - // CHECK-DAG: %[[VAL1:.+]] = "tosa.const"() {value = dense<{{\[}}[5.000000e+00, 6.000000e+00, 7.000000e+00, 8.000000e+00]]> : tensor<1x4xf32>} + // CHECK-DAG: %[[VAL0:.+]] = "tosa.const"() <{value = dense<{{\[}}[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]]> : tensor<1x4xf32>}> + // CHECK-DAG: %[[VAL1:.+]] = "tosa.const"() <{value = dense<{{\[}}[5.000000e+00, 6.000000e+00, 7.000000e+00, 8.000000e+00]]> : tensor<1x4xf32>}> // CHECK-DAG: %[[VAL2:.+]] = "tosa.cast"(%arg0) : (tensor<1x4x!quant.uniform>) -> tensor<1x4xf32> // CHECK-DAG: %[[VAL3:.+]] = "tosa.sub"(%[[VAL2]], %[[VAL1]]) : (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<1x4xf32> - // CHECK: %[[VAL4:.+]] = "tosa.mul"(%[[VAL3]], %[[VAL0]]) {shift = 0 : i32} : (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<1x4xf32> + // CHECK: %[[VAL4:.+]] = "tosa.mul"(%[[VAL3]], %[[VAL0]]) <{shift = 0 : i32}> : (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<1x4xf32> %0 = "tfl.dequantize"(%arg0) : (tensor<1x4x!quant.uniform>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1751,11 +1844,11 @@ func.func @test_quantfork.stats(%arg0: tensor<2x1xf32>) -> (tensor<2x1xf32>) { // ----- // CHECK-LABEL: test_add_qi8 -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.rescale"(%arg0) {double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.rescale"(%[[VAL_0]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.rescale"(%arg1) {double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.rescale"(%arg0) <{double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.rescale"(%[[VAL_0]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.rescale"(%arg1) <{double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> // CHECK-DAG: %[[VAL_3:.*]] = "tosa.add"(%[[VAL_1]], %[[VAL_2]]) -// CHECK: %[[VAL_4:.*]] = "tosa.rescale"(%[[VAL_3]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = -1 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK: %[[VAL_4:.*]] = "tosa.rescale"(%[[VAL_3]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = -1 : i32, per_channel = false, scale32 = true, shift = array}> func.func @test_add_qi8(%arg0: tensor<13x21x1x!quant.uniform>, %arg1: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x1x!quant.uniform>, tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -1764,11 +1857,11 @@ func.func @test_add_qi8(%arg0: tensor<13x21x1x!quant.uniform, output_zp = 0 : i32, per_channel = 
false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.rescale"(%[[VAL_0]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.rescale"(%arg1) {double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.rescale"(%arg0) <{double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.rescale"(%[[VAL_0]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.rescale"(%arg1) <{double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> // CHECK-DAG: %[[VAL_3:.*]] = "tosa.sub"(%[[VAL_1]], %[[VAL_2]]) -// CHECK: %[[VAL_4:.*]] = "tosa.rescale"(%[[VAL_3]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK: %[[VAL_4:.*]] = "tosa.rescale"(%[[VAL_3]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> func.func @test_sub_qi8(%arg0: tensor<1x21x3x!quant.uniform>, %arg1: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = tfl.sub(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x21x3x!quant.uniform>, tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -1779,7 +1872,7 @@ func.func @test_sub_qi8(%arg0: tensor<1x21x3x!quant.uniform // CHECK: %[[VAR3:.*]] = "tosa.rescale"(%[[VAR2]]) func.func @test_mul_qi8(%arg0: tensor<13x21x3x!quant.uniform>, %arg1: tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3x!quant.uniform>, tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -1789,7 +1882,7 @@ func.func @test_mul_qi8(%arg0: tensor<13x21x3x!quant.uniform, pad = array, quantization_info = #tosa.unary_quant, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{kernel = array, pad = array, quantization_info = #tosa.unary_quant, stride = array}> // CHECK-SAME: -> tensor<1x32x32x8x!quant.uniform> func.func @test_avg_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -1799,7 +1892,7 @@ func.func @test_avg_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{kernel = array, pad = array, stride = array}> // CHECK-SAME: -> tensor<1x32x32x8xi16> func.func @test_avg_pool2d_i16(%arg0: tensor<1x32x32x8xi16>) -> tensor<*xi16> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xi16>) -> tensor<*xi16> @@ -1809,7 +1902,7 @@ func.func @test_avg_pool2d_i16(%arg0: tensor<1x32x32x8xi16>) -> 
tensor<*xi16> { // ----- // CHECK-LABEL: test_max_pool2d_qi8 -// CHECK: %[[VAR0:.*]] = "tosa.max_pool2d"(%arg0) {kernel = array, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.max_pool2d"(%arg0) <{kernel = array, pad = array, stride = array}> func.func @test_max_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8x!quant.uniform>) -> tensor<*x!quant.uniform> func.return %0 : tensor<*x!quant.uniform> @@ -1818,24 +1911,24 @@ func.func @test_max_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() {value = dense<4> : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() {value = dense<536870912> : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() {value = dense<1515870810> : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR5:.*]] = "tosa.const"() {value = dense<-1010580540> : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR6:.*]] = "tosa.const"() {value = dense<1> : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR7:.*]] = "tosa.const"() {value = dense<12> : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR8:.*]] = "tosa.const"() {value = dense<7> : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR9:.*]] = "tosa.const"() {value = dense<9> : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR10:.*]] = "tosa.const"() {value = dense<17> : tensor<1x1x1xi32>} -// CHECK-DAG: %[[VAR11:.*]] = "tosa.const"() {value = dense<"0x5{{.*}}"> : tensor<513xi16>} -// CHECK-DAG: %[[VAR12:.*]] = "tosa.const"() {value = dense<"0xE{{.*}}"> : tensor<513xi16>} -// CHECK-DAG: %[[VAR13:.*]] = "tosa.const"() {value = dense<"0x4{{.*}}"> : tensor<513xi16>} -// CHECK-DAG: %[[VAR14:.*]] = "tosa.const"() {value = dense<"0x0{{.*}}"> : tensor<513xi16>} -// CHECK-DAG: %[[VAR15:.*]] = "tosa.rescale"(%arg0) {double_round = false, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAR16:.*]] = "tosa.reduce_max"(%[[VAR15]]) {axis = 2 : i64} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<35> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<4> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = dense<536870912> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{value = dense<1515870810> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR5:.*]] = "tosa.const"() <{value = dense<-1010580540> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR6:.*]] = "tosa.const"() <{value = dense<1> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR7:.*]] = "tosa.const"() <{value = dense<12> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR8:.*]] = "tosa.const"() <{value = dense<7> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR9:.*]] = "tosa.const"() <{value = dense<9> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR10:.*]] = "tosa.const"() <{value = dense<17> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR11:.*]] = "tosa.const"() <{value = dense<"0x5{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[VAR12:.*]] = "tosa.const"() <{value = dense<"0xE{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[VAR13:.*]] = "tosa.const"() <{value = dense<"0x4{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[VAR14:.*]] = "tosa.const"() <{value = dense<"0x0{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[VAR15:.*]] = "tosa.rescale"(%arg0) <{double_round = false, input_zp = -1 : i32, 
multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK-DAG: %[[VAR16:.*]] = "tosa.reduce_max"(%[[VAR15]]) <{axis = 2 : i64}> // CHECK-DAG: %[[VAR17:.*]] = "tosa.sub"(%[[VAR15]], %[[VAR16]]) -// CHECK-DAG: %[[VAR18:.*]] = "tosa.rescale"(%[[VAR17]]) {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAR18:.*]] = "tosa.rescale"(%[[VAR17]]) <{double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> // CHECK-DAG: %[[VAR19:.*]] = "tosa.table"(%[[VAR18]], %[[VAR14]]) // CHECK-DAG: %[[VAR20:.*]] = "tosa.table"(%[[VAR18]], %[[VAR13]]) // CHECK-DAG: %[[VAR21:.*]] = "tosa.table"(%[[VAR18]], %[[VAR12]]) @@ -1843,36 +1936,36 @@ func.func @test_max_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform // CHECK-DAG: %[[VAR27:.*]] = "tosa.add"(%[[VAR23]], %[[VAR24]]) // CHECK-DAG: %[[VAR28:.*]] = "tosa.add"(%[[VAR27]], %[[VAR25]]) // CHECK-DAG: %[[VAR29:.*]] = "tosa.add"(%[[VAR28]], %[[VAR26]]) -// CHECK-DAG: %[[VAR30:.*]] = "tosa.arithmetic_right_shift"(%[[VAR29]], %[[VAR7]]) {round = true} -// CHECK-DAG: %[[VAR31:.*]] = "tosa.reduce_sum"(%[[VAR30]]) {axis = 2 : i64} +// CHECK-DAG: %[[VAR30:.*]] = "tosa.arithmetic_right_shift"(%[[VAR29]], %[[VAR7]]) <{round = true}> +// CHECK-DAG: %[[VAR31:.*]] = "tosa.reduce_sum"(%[[VAR30]]) <{axis = 2 : i64}> // CHECK-DAG: %[[VAR32:.*]] = "tosa.clz"(%[[VAR31]]) // CHECK-DAG: %[[VAR33:.*]] = "tosa.sub"(%[[VAR32]], %[[VAR6]]) // CHECK-DAG: %[[VAR34:.*]] = "tosa.logical_left_shift"(%[[VAR31]], %[[VAR33]]) -// CHECK-DAG: %[[VAR35:.*]] = "tosa.mul"(%[[VAR34]], %[[VAR5]]) {shift = 31 : i32} +// CHECK-DAG: %[[VAR35:.*]] = "tosa.mul"(%[[VAR34]], %[[VAR5]]) <{shift = 31 : i32}> // CHECK-DAG: %[[VAR36:.*]] = "tosa.add"(%[[VAR35]], %[[VAR4]]) -// CHECK-DAG: %[[VAR37:.*]] = "tosa.mul"(%[[VAR36]], %[[VAR34]]) {shift = 31 : i32} +// CHECK-DAG: %[[VAR37:.*]] = "tosa.mul"(%[[VAR36]], %[[VAR34]]) <{shift = 31 : i32}> // CHECK-DAG: %[[VAR38:.*]] = "tosa.sub"(%[[VAR3]], %[[VAR37]]) -// CHECK-DAG: %[[VAR39:.*]] = "tosa.mul"(%[[VAR36]], %[[VAR38]]) {shift = 31 : i32} -// CHECK-DAG: %[[VAR40:.*]] = "tosa.mul"(%[[VAR39]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR39:.*]] = "tosa.mul"(%[[VAR36]], %[[VAR38]]) <{shift = 31 : i32}> +// CHECK-DAG: %[[VAR40:.*]] = "tosa.mul"(%[[VAR39]], %[[VAR2]]) <{shift = 0 : i32}> // CHECK-DAG: %[[VAR41:.*]] = "tosa.add"(%[[VAR36]], %[[VAR40]]) -// CHECK-DAG: %[[VAR42:.*]] = "tosa.mul"(%[[VAR41]], %[[VAR34]]) {shift = 31 : i32} +// CHECK-DAG: %[[VAR42:.*]] = "tosa.mul"(%[[VAR41]], %[[VAR34]]) <{shift = 31 : i32}> // CHECK-DAG: %[[VAR43:.*]] = "tosa.sub"(%[[VAR3]], %[[VAR42]]) -// CHECK-DAG: %[[VAR44:.*]] = "tosa.mul"(%[[VAR41]], %[[VAR43]]) {shift = 31 : i32} -// CHECK-DAG: %[[VAR45:.*]] = "tosa.mul"(%[[VAR44]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR44:.*]] = "tosa.mul"(%[[VAR41]], %[[VAR43]]) <{shift = 31 : i32}> +// CHECK-DAG: %[[VAR45:.*]] = "tosa.mul"(%[[VAR44]], %[[VAR2]]) <{shift = 0 : i32}> // CHECK-DAG: %[[VAR46:.*]] = "tosa.add"(%[[VAR41]], %[[VAR45]]) -// CHECK-DAG: %[[VAR47:.*]] = "tosa.mul"(%[[VAR46]], %[[VAR34]]) {shift = 31 : i32} +// CHECK-DAG: %[[VAR47:.*]] = "tosa.mul"(%[[VAR46]], %[[VAR34]]) <{shift = 31 : i32}> // CHECK-DAG: %[[VAR48:.*]] = "tosa.sub"(%[[VAR3]], %[[VAR47]]) -// CHECK-DAG: %[[VAR49:.*]] = "tosa.mul"(%[[VAR46]], %[[VAR48]]) {shift = 31 : i32} -// CHECK-DAG: %[[VAR50:.*]] = 
"tosa.mul"(%[[VAR49]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR49:.*]] = "tosa.mul"(%[[VAR46]], %[[VAR48]]) <{shift = 31 : i32}> +// CHECK-DAG: %[[VAR50:.*]] = "tosa.mul"(%[[VAR49]], %[[VAR2]]) <{shift = 0 : i32}> // CHECK-DAG: %[[VAR51:.*]] = "tosa.add"(%[[VAR46]], %[[VAR50]]) -// CHECK-DAG: %[[VAR52:.*]] = "tosa.mul"(%[[VAR29]], %[[VAR51]]) {shift = 30 : i32} +// CHECK-DAG: %[[VAR52:.*]] = "tosa.mul"(%[[VAR29]], %[[VAR51]]) <{shift = 30 : i32}> // CHECK-DAG: %[[VAR53:.*]] = "tosa.sub"(%[[VAR1]], %[[VAR32]]) -// CHECK-DAG: %[[VAR54:.*]] = "tosa.arithmetic_right_shift"(%[[VAR52]], %[[VAR53]]) {round = true} -// CHECK: %[[VAR55:.*]] = "tosa.rescale"(%[[VAR54]]) {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = -128 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAR54:.*]] = "tosa.arithmetic_right_shift"(%[[VAR52]], %[[VAR53]]) <{round = true}> +// CHECK: %[[VAR55:.*]] = "tosa.rescale"(%[[VAR54]]) <{double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = -128 : i32, per_channel = false, scale32 = true, shift = array}> func.func @test_softmax_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -1882,37 +1975,37 @@ func.func @test_softmax_qi8(%arg0: tensor<13x21x3x!quant.uniform : tensor<1x1xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<7> : tensor<1x1xi32>} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() {value = dense<32768> : tensor<1x1xi32>} -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() {value = dense<14> : tensor<1x1xi32>} -// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() {value = dense<1073741824> : tensor<1x1xi32>} -// CHECK-DAG: %[[VAR5:.*]] = "tosa.const"() {value = dense<1> : tensor<1x1xi32>} -// CHECK-DAG: %[[VAR6:.*]] = "tosa.const"() {value = dense<32767> : tensor<1x1xi32>} -// CHECK-DAG: %[[VAR7:.*]] = "tosa.const"() {value = dense<"0xF{{.*}}> -// CHECK-DAG: %[[VAR8:.*]] = "tosa.const"() {value = dense<"0x0{{.*}}> : tensor<513xi16>} -// CHECK-DAG: %[[VAR9:.*]] = "tosa.rescale"(%arg0) {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAR10:.*]] = "tosa.reduce_max"(%[[VAR9]]) {axis = 1 : i64} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<31> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<7> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<32768> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = dense<14> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{value = dense<1073741824> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAR5:.*]] = "tosa.const"() <{value = dense<1> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAR6:.*]] = "tosa.const"() <{value = dense<32767> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAR7:.*]] = "tosa.const"() <{value = dense<"0xF{{.*}}> +// CHECK-DAG: %[[VAR8:.*]] = "tosa.const"() <{value = dense<"0x0{{.*}}> : tensor<513xi16>}> +// CHECK-DAG: %[[VAR9:.*]] = "tosa.rescale"(%arg0) <{double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +// CHECK-DAG: %[[VAR10:.*]] = "tosa.reduce_max"(%[[VAR9]]) <{axis = 1 : i64}> // CHECK-DAG: %[[VAR11:.*]] = "tosa.sub"(%[[VAR9]], %[[VAR10]]) -// 
CHECK-DAG: %[[VAR12:.*]] = "tosa.rescale"(%[[VAR11]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAR12:.*]] = "tosa.rescale"(%[[VAR11]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> // CHECK-DAG: %[[VAR13:.*]] = "tosa.add"(%[[VAR12]], %[[VAR6]]) // CHECK-DAG: %[[VAR14:.*]] = "tosa.cast"(%[[VAR13]]) // CHECK-DAG: %[[VAR15:.*]] = "tosa.table"(%[[VAR14]], %[[VAR8]]) -// CHECK-DAG: %[[VAR16:.*]] = "tosa.arithmetic_right_shift"(%[[VAR15]], %[[VAR1]]) {round = true} -// CHECK-DAG: %[[VAR17:.*]] = "tosa.reduce_sum"(%[[VAR16]]) {axis = 1 : i64} +// CHECK-DAG: %[[VAR16:.*]] = "tosa.arithmetic_right_shift"(%[[VAR15]], %[[VAR1]]) <{round = true}> +// CHECK-DAG: %[[VAR17:.*]] = "tosa.reduce_sum"(%[[VAR16]]) <{axis = 1 : i64}> // CHECK-DAG: %[[VAR18:.*]] = "tosa.clz"(%[[VAR17]]) // CHECK-DAG: %[[VAR19:.*]] = "tosa.sub"(%[[VAR18]], %[[VAR5]]) // CHECK-DAG: %[[VAR20:.*]] = "tosa.logical_left_shift"(%[[VAR17]], %[[VAR19]]) // CHECK-DAG: %[[VAR21:.*]] = "tosa.sub"(%[[VAR20]], %[[VAR4]]) -// CHECK-DAG: %[[VAR22:.*]] = "tosa.arithmetic_right_shift"(%[[VAR21]], %[[VAR3]]) {round = true} +// CHECK-DAG: %[[VAR22:.*]] = "tosa.arithmetic_right_shift"(%[[VAR21]], %[[VAR3]]) <{round = true}> // CHECK-DAG: %[[VAR23:.*]] = "tosa.sub"(%[[VAR22]], %[[VAR2]]) // CHECK-DAG: %[[VAR24:.*]] = "tosa.cast"(%[[VAR23]]) // CHECK-DAG: %[[VAR25:.*]] = "tosa.table"(%[[VAR24]], %[[VAR7]]) -// CHECK-DAG: %[[VAR26:.*]] = "tosa.arithmetic_right_shift"(%[[VAR25]], %[[VAR1]]) {round = true} -// CHECK-DAG: %[[VAR27:.*]] = "tosa.mul"(%[[VAR26]], %[[VAR16]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR26:.*]] = "tosa.arithmetic_right_shift"(%[[VAR25]], %[[VAR1]]) <{round = true}> +// CHECK-DAG: %[[VAR27:.*]] = "tosa.mul"(%[[VAR26]], %[[VAR16]]) <{shift = 0 : i32}> // CHECK-DAG: %[[VAR28:.*]] = "tosa.sub"(%[[VAR0]], %[[VAR18]]) -// CHECK-DAG: %[[VAR29:.*]] = "tosa.arithmetic_right_shift"(%[[VAR27]], %[[VAR28]]) {round = true} -// CHECK: %[[VAR30:.*]] = "tosa.rescale"(%[[VAR29]]) {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAR29:.*]] = "tosa.arithmetic_right_shift"(%[[VAR27]], %[[VAR28]]) <{round = true}> +// CHECK: %[[VAR30:.*]] = "tosa.rescale"(%[[VAR29]]) <{double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> func.func @test_softmax_qi16(%arg0: tensor<14x19x!quant.uniform>) -> tensor<14x19x!quant.uniform> { %0 = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<14x19x!quant.uniform>) -> tensor<14x19x!quant.uniform> func.return %0 : tensor<14x19x!quant.uniform> @@ -1921,7 +2014,7 @@ func.func @test_softmax_qi16(%arg0: tensor<14x19x!quant.uniform : tensor<256xi8>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<256xi8>}> // CHECK: %[[VAR1:.*]] = "tosa.table"(%arg0, %[[VAR0]]) func.func @test_sigmoid_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.logistic"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -1931,7 +2024,7 @@ func.func @test_sigmoid_qi8(%arg0: tensor<13x21x3x!quant.uniform : tensor<256xi8>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<256xi8>}> // CHECK: %[[VAR1:.*]] = "tosa.table"(%arg0, %[[VAR0]]) func.func 
@test_tanh_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.tanh"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -1942,7 +2035,7 @@ func.func @test_tanh_qi8(%arg0: tensor<13x21x3x!quant.uniform func.func @test_relu_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = "tfl.relu"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -1952,7 +2045,7 @@ func.func @test_relu_qi8(%arg0: tensor<13x21x3x!quant.uniform func.func @test_relu0To1_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = "tfl.relu_n1_to_1"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -1962,7 +2055,7 @@ func.func @test_relu0To1_qi8(%arg0: tensor<13x21x3x!quant.uniform func.func @test_relu6_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = "tfl.relu6"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -1974,7 +2067,7 @@ func.func @test_relu6_qi8(%arg0: tensor<13x21x3x!quant.uniform // CHECK: %[[VAL_5:.*]] = "tosa.rescale"(%[[VAL_4]]) // CHECK: %[[VAL_6:.*]] = "tosa.rescale"(%[[VAL_5]]) func.func @test_relu6_qu8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { @@ -1987,10 +2080,10 @@ func.func @test_relu6_qu8(%arg0: tensor<13x21x3x!quant.uniform> func.func @test_leaky_relu_qi8(%arg0: tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.leaky_relu"(%arg0) {alpha = 0.948724806 : f32} : (tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -2000,10 +2093,10 @@ func.func @test_leaky_relu_qi8(%arg0: tensor<14x19x!quant.uniform> func.func @test_leaky_relu_qi16(%arg0: tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.leaky_relu"(%arg0) {alpha = 1.048724806 : f32} : (tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -2013,8 +2106,8 @@ func.func @test_leaky_relu_qi16(%arg0: tensor<14x19x!quant.uniform, mode = "BILINEAR", offset = array, scale = array} -// CHECK: %[[VAR2:.*]] = "tosa.rescale"(%[[VAR1]]) {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "BILINEAR", offset = array, scale = array}> +// CHECK: %[[VAR2:.*]] = "tosa.rescale"(%[[VAR1]]) <{double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> func.func @test_resize_bilinear_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.resize_bilinear"(%arg0, %0) {align_corners = false, half_pixel_centers = false} : (tensor<1x80x80x2x!quant.uniform>, tensor<2xi32>) -> tensor<1x640x640x2x!quant.uniform> @@ -2024,7 +2117,7 @@ func.func @test_resize_bilinear_qi8(%arg0: tensor<1x80x80x2x!quant.uniform, mode = "BILINEAR", offset = array, scale = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "BILINEAR", offset = array, scale = array}> func.func @test_resize_bilinear_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> 
tensor<2xi32> %1 = "tfl.resize_bilinear"(%arg0, %0) {align_corners = false, half_pixel_centers = true} : (tensor<1x80x80x2x!quant.uniform>, tensor<2xi32>) -> tensor<1x640x640x2x!quant.uniform> @@ -2034,7 +2127,7 @@ func.func @test_resize_bilinear_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform< // ----- // CHECK-LABEL: test_resize_bilinear_align_qi8 -// CHECK-DAG: %[[VAR1:.*]] = "tosa.resize"(%arg0) {border = array, mode = "BILINEAR", offset = array, scale = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "BILINEAR", offset = array, scale = array}> func.func @test_resize_bilinear_align_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.resize_bilinear"(%arg0, %0) {align_corners = true, half_pixel_centers = false} : (tensor<1x80x80x2x!quant.uniform>, tensor<2xi32>) -> tensor<1x640x640x2x!quant.uniform> @@ -2044,7 +2137,7 @@ func.func @test_resize_bilinear_align_qi8(%arg0: tensor<1x80x80x2x!quant.uniform // ----- // CHECK-LABEL: test_resize_bilinear_align_half_qi8 -// CHECK-DAG: %[[VAR1:.*]] = "tosa.resize"(%arg0) {border = array, mode = "BILINEAR", offset = array, scale = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "BILINEAR", offset = array, scale = array}> func.func @test_resize_bilinear_align_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.resize_bilinear"(%arg0, %0) {align_corners = true, half_pixel_centers = true} : (tensor<1x80x80x2x!quant.uniform>, tensor<2xi32>) -> tensor<1x640x640x2x!quant.uniform> @@ -2054,7 +2147,7 @@ func.func @test_resize_bilinear_align_half_qi8(%arg0: tensor<1x80x80x2x!quant.un // ----- // CHECK-LABEL: test_resize_nearest_qi8 -// CHECK: %[[VAR1:.*]] = "tosa.resize"(%arg0) {border = array, mode = "NEAREST_NEIGHBOR", offset = array, scale = array} +// CHECK: %[[VAR1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "NEAREST_NEIGHBOR", offset = array, scale = array}> func.func @test_resize_nearest_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.resize_nearest_neighbor"(%arg0, %0) {align_corners = false, half_pixel_centers = false} : (tensor<1x80x80x2x!quant.uniform>, tensor<2xi32>) -> tensor<1x640x640x2x!quant.uniform> @@ -2065,7 +2158,7 @@ func.func @test_resize_nearest_qi8(%arg0: tensor<1x80x80x2x!quant.uniform, mode = "NEAREST_NEIGHBOR", offset = array, scale = array} +// CHECK: %[[VAR1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "NEAREST_NEIGHBOR", offset = array, scale = array}> func.func @test_resize_nearest_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.resize_nearest_neighbor"(%arg0, %0) {align_corners = false, half_pixel_centers = true} : (tensor<1x80x80x2x!quant.uniform>, tensor<2xi32>) -> tensor<1x640x640x2x!quant.uniform> @@ -2075,7 +2168,7 @@ func.func @test_resize_nearest_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform, mode = "NEAREST_NEIGHBOR", offset = array, scale = array} +// CHECK: %[[VAR1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "NEAREST_NEIGHBOR", offset = array, scale = array}> func.func 
@test_resize_nearest_align_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.resize_nearest_neighbor"(%arg0, %0) {align_corners = true, half_pixel_centers = false} : (tensor<1x80x80x2x!quant.uniform>, tensor<2xi32>) -> tensor<1x640x640x2x!quant.uniform> @@ -2085,7 +2178,7 @@ func.func @test_resize_nearest_align_qi8(%arg0: tensor<1x80x80x2x!quant.uniform< // ----- // CHECK-LABEL: test_resize_nearest_align_half_qi8 -// CHECK: %[[VAR1:.*]] = "tosa.resize"(%arg0) {border = array, mode = "NEAREST_NEIGHBOR", offset = array, scale = array} +// CHECK: %[[VAR1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "NEAREST_NEIGHBOR", offset = array, scale = array}> func.func @test_resize_nearest_align_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.resize_nearest_neighbor"(%arg0, %0) {align_corners = true, half_pixel_centers = true} : (tensor<1x80x80x2x!quant.uniform>, tensor<2xi32>) -> tensor<1x640x640x2x!quant.uniform> @@ -2094,12 +2187,74 @@ func.func @test_resize_nearest_align_half_qi8(%arg0: tensor<1x80x80x2x!quant.uni // ----- +// CHECK-LABEL: test_resize_bilinear_f32_scalar_input +// CHECK: %[[VAL_1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "BILINEAR", offset = array, scale = array}> +func.func @test_resize_bilinear_f32_scalar_input(%arg0: tensor<3x1x1x7xf32>) -> tensor<3x2x2x7xf32> { + %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tfl.resize_bilinear"(%arg0, %0) {align_corners = false, half_pixel_centers = false} : (tensor<3x1x1x7xf32>, tensor<2xi32>) -> tensor<3x2x2x7xf32> + func.return %1 : tensor<3x2x2x7xf32> +} + +// ----- + +// CHECK-LABEL: test_resize_bilinear_half_qi8_scalar_input +// CHECK: %[[VAL_1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "BILINEAR", offset = array, scale = array}> +// CHECK: %[[VAL_2:.*]] = "tosa.rescale"(%[[VAL_1]]) <{double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +func.func @test_resize_bilinear_half_qi8_scalar_input(%arg0: tensor<3x1x1x7x!quant.uniform>) -> tensor<3x2x2x7x!quant.uniform> { + %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tfl.resize_bilinear"(%arg0, %0) {align_corners = false, half_pixel_centers = true} : (tensor<3x1x1x7x!quant.uniform>, tensor<2xi32>) -> tensor<3x2x2x7x!quant.uniform> + func.return %1 : tensor<3x2x2x7x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: test_resize_bilinear_align_qi8_scalar_input +// CHECK: %[[VAL_1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "BILINEAR", offset = array, scale = array}> +// CHECK: %[[VAL_2:.*]] = "tosa.rescale"(%[[VAL_1]]) <{double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array}> +func.func @test_resize_bilinear_align_qi8_scalar_input(%arg0: tensor<3x1x1x7x!quant.uniform>) -> tensor<3x2x2x7x!quant.uniform> { + %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tfl.resize_bilinear"(%arg0, %0) {align_corners = true, half_pixel_centers = false} : (tensor<3x1x1x7x!quant.uniform>, tensor<2xi32>) -> tensor<3x2x2x7x!quant.uniform> + func.return %1 : tensor<3x2x2x7x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: 
test_resize_nearest_f32_scalar_input +// CHECK: %[[VAL_1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "NEAREST_NEIGHBOR", offset = array, scale = array}> +func.func @test_resize_nearest_f32_scalar_input(%arg0: tensor<3x1x1x7xf32>) -> tensor<3x2x2x7xf32> { + %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tfl.resize_nearest_neighbor"(%arg0, %0) {align_corners = false, half_pixel_centers = false} : (tensor<3x1x1x7xf32>, tensor<2xi32>) -> tensor<3x2x2x7xf32> + func.return %1 : tensor<3x2x2x7xf32> +} + +// ----- + +// CHECK-LABEL: test_resize_nearest_half_qi8_scalar_input +// CHECK: %[[VAL_1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "NEAREST_NEIGHBOR", offset = array, scale = array}> +func.func @test_resize_nearest_half_qi8_scalar_input(%arg0: tensor<3x1x1x7x!quant.uniform>) -> tensor<3x2x2x7x!quant.uniform> { + %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tfl.resize_nearest_neighbor"(%arg0, %0) {align_corners = false, half_pixel_centers = true} : (tensor<3x1x1x7x!quant.uniform>, tensor<2xi32>) -> tensor<3x2x2x7x!quant.uniform> + func.return %1 : tensor<3x2x2x7x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: test_resize_nearest_align_qi8_scalar_input +// CHECK: %[[VAL_1:.*]] = "tosa.resize"(%arg0) <{border = array, mode = "NEAREST_NEIGHBOR", offset = array, scale = array}> +func.func @test_resize_nearest_align_qi8_scalar_input(%arg0: tensor<3x1x1x7x!quant.uniform>) -> tensor<3x2x2x7x!quant.uniform> { + %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tfl.resize_nearest_neighbor"(%arg0, %0) {align_corners = true, half_pixel_centers = false} : (tensor<3x1x1x7x!quant.uniform>, tensor<2xi32>) -> tensor<3x2x2x7x!quant.uniform> + func.return %1 : tensor<3x2x2x7x!quant.uniform> +} + +// ----- + // CHECK-LABEL: test_fullyconnected_qi8 -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<0> : tensor<28xi32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0> : tensor<28xi32>}> // CHECK-DAG: %[[VAR2:.*]] = "tosa.transpose"(%arg1, %[[VAR0]]) -// CHECK-DAG: %[[VAR3:.*]] = "tosa.fully_connected"(%arg0, %[[VAR2]], %[[VAR1]]) {quantization_info = #tosa.conv_quant} -// CHECK: %[[VAR4:.*]] = "tosa.rescale"(%[[VAR3]]) {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 3 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.fully_connected"(%arg0, %[[VAR2]], %[[VAR1]]) <{quantization_info = #tosa.conv_quant}> +// CHECK: %[[VAR4:.*]] = "tosa.rescale"(%[[VAR3]]) <{double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 3 : i32, per_channel = false, scale32 = true, shift = array}> func.func @test_fullyconnected_qi8(%arg0: tensor<14x19x!quant.uniform>, %arg1: tensor<19x28x!quant.uniform>) -> tensor<14x28x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.transpose"(%arg1, %0) : (tensor<19x28x!quant.uniform>, tensor<2xi32>) -> tensor<28x19x!quant.uniform> @@ -2110,10 +2265,10 @@ func.func @test_fullyconnected_qi8(%arg0: tensor<14x19x!quant.uniform} -// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%arg1) {new_shape = array} +// CHECK-DAG: %[[VAR4:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> +// CHECK-DAG: %[[VAR5:.*]] = 
"tosa.reshape"(%arg1) <{new_shape = array}> // CHECK-DAG: %[[VAR6:.*]] = "tosa.gather"(%[[VAR4]], %[[VAR5]]) -// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) {new_shape = array} +// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) <{new_shape = array}> // CHECK: return %[[VAR7]] func.func @test_gather(%arg0: tensor<13x21x3xf32>, %arg1: tensor<7x7xi32>) -> tensor<*xf32> { %2 = "tfl.gather"(%arg0, %arg1) {axis = 0 : i32} : (tensor<13x21x3xf32>, tensor<7x7xi32>) -> tensor<*xf32> @@ -2122,10 +2277,10 @@ func.func @test_gather(%arg0: tensor<13x21x3xf32>, %arg1: tensor<7x7xi32>) -> te // ----- // CHECK-LABEL: test_gather_dyn -// CHECK-DAG: %[[VAR4:.*]] = "tosa.reshape"(%arg0) {new_shape = array} -// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%arg1) {new_shape = array} +// CHECK-DAG: %[[VAR4:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> +// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%arg1) <{new_shape = array}> // CHECK-DAG: %[[VAR6:.*]] = "tosa.gather"(%[[VAR4]], %[[VAR5]]) -// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) {new_shape = array} +// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) <{new_shape = array}> // CHECK: return %[[VAR7]] func.func @test_gather_dyn(%arg0: tensor, %arg1 : tensor<7x7xi32>) -> tensor<*xf32> { %2 = "tfl.gather"(%arg0, %arg1) {axis = 0 : i32} : (tensor, tensor<7x7xi32>) -> tensor<*xf32> @@ -2135,10 +2290,10 @@ func.func @test_gather_dyn(%arg0: tensor, %arg1 : tensor<7x7xi32>) - // ----- // CHECK-LABEL: test_gather_channel_dyn -// CHECK-DAG: %[[VAR4:.*]] = "tosa.reshape"(%arg0) {new_shape = array} -// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%arg1) {new_shape = array} +// CHECK-DAG: %[[VAR4:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> +// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%arg1) <{new_shape = array}> // CHECK-DAG: %[[VAR6:.*]] = "tosa.gather"(%[[VAR4]], %[[VAR5]]) -// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) {new_shape = array} +// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) <{new_shape = array}> // CHECK: return %[[VAR7]] func.func @test_gather_channel_dyn(%arg0: tensor<13x21x?xf32>, %arg1: tensor<7x7xi32>) -> tensor<*xf32> { %2 = "tfl.gather"(%arg0, %arg1) {axis = 0 : i32} : (tensor<13x21x?xf32>, tensor<7x7xi32>) -> tensor<*xf32> @@ -2147,10 +2302,10 @@ func.func @test_gather_channel_dyn(%arg0: tensor<13x21x?xf32>, %arg1: tensor<7x7 // ----- // CHECK-LABEL: test_gather_indices_dyn -// CHECK-DAG: %[[VAR4:.*]] = "tosa.reshape"(%arg0) {new_shape = array} -// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%arg1) {new_shape = array} +// CHECK-DAG: %[[VAR4:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> +// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%arg1) <{new_shape = array}> // CHECK-DAG: %[[VAR6:.*]] = "tosa.gather"(%[[VAR4]], %[[VAR5]]) -// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) {new_shape = array} +// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) <{new_shape = array}> // CHECK: return %[[VAR7]] func.func @test_gather_indices_dyn(%arg0: tensor<13x21x3xf32>, %arg1: tensor) -> tensor<*xf32> { %2 = "tfl.gather"(%arg0, %arg1) {axis = 0 : i32} : (tensor<13x21x3xf32>, tensor) -> tensor<*xf32> @@ -2160,9 +2315,9 @@ func.func @test_gather_indices_dyn(%arg0: tensor<13x21x3xf32>, %arg1: tensor} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> // CHECK-DAG: %[[VAR2:.*]] = "tosa.gather"(%[[VAR1]], %[[VAR0]]) -// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> // CHECK: return %[[VAR3]] 
func.func @test_gather_batch(%arg0: tensor<1x4x4x4xi32>) -> tensor<1x3x4x4xi32> { %0 = "tfl.pseudo_const"() {value = dense<[[0, 3, 1]]> : tensor<1x3xi32>} : () -> tensor<1x3xi32> @@ -2172,9 +2327,9 @@ func.func @test_gather_batch(%arg0: tensor<1x4x4x4xi32>) -> tensor<1x3x4x4xi32> // ----- // CHECK-LABEL: test_gather_batch_dyn -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) {new_shape = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> // CHECK-DAG: %[[VAR2:.*]] = "tosa.gather"(%[[VAR1]], %arg1) -// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) {new_shape = array} +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> // CHECK: return %[[VAR3]] func.func @test_gather_batch_dyn(%arg0: tensor, %arg1: tensor) -> tensor { %1 = "tfl.gather"(%arg0, %arg1) {axis = 1 : i32, batch_dims = 1 : i32} : (tensor, tensor) -> tensor @@ -2184,13 +2339,13 @@ func.func @test_gather_batch_dyn(%arg0: tensor, %arg1: tensor} -// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%arg1) {new_shape = array} -// CHECK-DAG: %[[VAR5:.*]] = "tosa.mul"(%[[VAR3]], %[[VAR1]]) {shift = 0 : i32} -// CHECK-DAG: %[[VAR6:.*]] = "tosa.reduce_sum"(%[[VAR5]]) {axis = 1 : i64} -// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) {new_shape = array} +// CHECK-DAG: %[[VAR2:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%arg1) <{new_shape = array}> +// CHECK-DAG: %[[VAR5:.*]] = "tosa.mul"(%[[VAR3]], %[[VAR1]]) <{shift = 0 : i32}> +// CHECK-DAG: %[[VAR6:.*]] = "tosa.reduce_sum"(%[[VAR5]]) <{axis = 1 : i64}> +// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR6]]) <{new_shape = array}> // CHECK-DAG: %[[VAR8:.*]] = "tosa.gather"(%[[VAR2]], %[[VAR7]]) -// CHECK: %[[VAR9:.*]] = "tosa.reshape"(%[[VAR8]]) {new_shape = array} +// CHECK: %[[VAR9:.*]] = "tosa.reshape"(%[[VAR8]]) <{new_shape = array}> func.func @test_gather_nd(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6x7x2xi32>) -> tensor<6x7x3xf32> { %1 = "tfl.gather_nd"(%arg0, %arg1) : (tensor<13x21x3xf32>, tensor<6x7x2xi32>) -> tensor<6x7x3xf32> func.return %1 : tensor<6x7x3xf32> @@ -2199,10 +2354,10 @@ func.func @test_gather_nd(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6x7x2xi32>) // ----- // CHECK-LABEL: test_gather_cast // CHECK-DAG: %[[VAR1:.*]] = "tosa.cast"(%arg1) -// CHECK-DAG: %[[VAR2:.*]] = "tosa.reshape"(%arg0) {new_shape = array} -// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR1]]) {new_shape = array} +// CHECK-DAG: %[[VAR2:.*]] = "tosa.reshape"(%arg0) <{new_shape = array}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR1]]) <{new_shape = array}> // CHECK-DAG: %[[VAR4:.*]] = "tosa.gather"(%[[VAR2]], %[[VAR3]]) -// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%[[VAR4]]) {new_shape = array} +// CHECK-DAG: %[[VAR5:.*]] = "tosa.reshape"(%[[VAR4]]) <{new_shape = array}> // CHECK: return %[[VAR5]] func.func @test_gather_cast(%arg0: tensor<13x21x3xf32>, %arg1: tensor<7x7xi64>) -> tensor<*xf32> { %2 = "tfl.gather"(%arg0, %arg1) {axis = 0 : i32} : (tensor<13x21x3xf32>, tensor<7x7xi64>) -> tensor<*xf32> @@ -2211,15 +2366,15 @@ func.func @test_gather_cast(%arg0: tensor<13x21x3xf32>, %arg1: tensor<7x7xi64>) // ----- -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<{{\[\[}}48, 1]]> : tensor<1x2xi32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<-1> : tensor<1x48x1xi64>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{\[\[}}48, 1]]> : tensor<1x2xi32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<-1> : tensor<1x48x1xi64>}> // 
CHECK-DAG: %[[VAR2:.*]] = "tosa.cast"(%arg0) -// CHECK-DAG: %[[VAR4:.*]] = "tosa.mul"(%[[VAR2]], %[[VAR0]]) {shift = 0 : i32} -// CHECK-DAG: %[[VAR5:.*]] = "tosa.reduce_sum"(%[[VAR4]]) {axis = 1 : i64} -// CHECK-DAG: %[[VAR6:.*]] = "tosa.reshape"(%arg1) {new_shape = array} -// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR5]]) {new_shape = array} +// CHECK-DAG: %[[VAR4:.*]] = "tosa.mul"(%[[VAR2]], %[[VAR0]]) <{shift = 0 : i32}> +// CHECK-DAG: %[[VAR5:.*]] = "tosa.reduce_sum"(%[[VAR4]]) <{axis = 1 : i64}> +// CHECK-DAG: %[[VAR6:.*]] = "tosa.reshape"(%arg1) <{new_shape = array}> +// CHECK-DAG: %[[VAR7:.*]] = "tosa.reshape"(%[[VAR5]]) <{new_shape = array}> // CHECK-DAG: %[[VAR8:.*]] = "tosa.scatter"(%[[VAR1]], %[[VAR7]], %[[VAR6]]) -// CHECK-DAG: %[[VAR9:.*]] = "tosa.reshape"(%[[VAR8]]) {new_shape = array} +// CHECK-DAG: %[[VAR9:.*]] = "tosa.reshape"(%[[VAR8]]) <{new_shape = array}> // CHECK: return %[[VAR9]] func.func @sparse_to_dense(%arg0 : tensor, %arg1 : tensor) -> (tensor<1x48xi64>) { %0 = arith.constant dense<[1, 48]> : tensor<2xi64> @@ -2232,7 +2387,7 @@ func.func @sparse_to_dense(%arg0 : tensor, %arg1 : tensor) -> (t // CHECK-LABEL: @test_arg_max func.func @test_arg_max(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { - // CHECK: %[[ARGMAX:.+]] = "tosa.argmax"(%arg0) {axis = 1 : i64} + // CHECK: %[[ARGMAX:.+]] = "tosa.argmax"(%arg0) <{axis = 1 : i64}> %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %1 = "tfl.arg_max"(%arg0, %0) : (tensor<13x21x3xf32>, tensor) -> tensor<*xf32> func.return %1 : tensor<*xf32> @@ -2242,7 +2397,7 @@ func.func @test_arg_max(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // CHECK-LABEL: @test_arg_max_negative_dim func.func @test_arg_max_negative_dim(%arg0: tensor<13x21x3xf32>) -> tensor<13x21xf32> { - // CHECK: %[[ARGMAX:.+]] = "tosa.argmax"(%arg0) {axis = 2 : i64} + // CHECK: %[[ARGMAX:.+]] = "tosa.argmax"(%arg0) <{axis = 2 : i64}> %0 = "tfl.pseudo_const"() {value = dense<-1> : tensor} : () -> tensor %1 = "tfl.arg_max"(%arg0, %0) : (tensor<13x21x3xf32>, tensor) -> tensor<13x21xf32> func.return %1 : tensor<13x21xf32> @@ -2253,7 +2408,7 @@ func.func @test_arg_max_negative_dim(%arg0: tensor<13x21x3xf32>) -> tensor<13x21 // CHECK-LABEL: @test_arg_min_f32 func.func @test_arg_min_f32(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // CHECK: %[[NEG:.+]] = "tosa.negate"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> - // CHECK: "tosa.argmax"(%[[NEG]]) {axis = 1 : i64} + // CHECK: "tosa.argmax"(%[[NEG]]) <{axis = 1 : i64}> %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %1 = "tfl.arg_min"(%arg0, %0) : (tensor<13x21x3xf32>, tensor) -> tensor<*xf32> func.return %1 : tensor<*xf32> @@ -2263,9 +2418,9 @@ func.func @test_arg_min_f32(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // CHECK-LABEL: @test_arg_min_i32 func.func @test_arg_min_i32(%arg0: tensor<13x21x3xi32>) -> tensor<*xi32> { - // CHECK: %[[ONE:.+]] = "tosa.const"() {value = dense<-1> : tensor<1x1x1xi32>} + // CHECK: %[[ONE:.+]] = "tosa.const"() <{value = dense<-1> : tensor<1x1x1xi32>}> // CHECK: %[[SUB:.+]] = "tosa.sub"(%[[ONE]], %arg0) - // CHECK: %[[ARGMAX:.+]] = "tosa.argmax"(%[[SUB]]) {axis = 1 : i64} + // CHECK: %[[ARGMAX:.+]] = "tosa.argmax"(%[[SUB]]) <{axis = 1 : i64}> %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %1 = "tfl.arg_min"(%arg0, %0) : (tensor<13x21x3xi32>, tensor) -> tensor<*xi32> func.return %1 : tensor<*xi32> @@ -2275,9 +2430,9 @@ func.func @test_arg_min_i32(%arg0: tensor<13x21x3xi32>) -> tensor<*xi32> { // CHECK-LABEL: 
@test_arg_min_ui8 func.func @test_arg_min_ui8(%arg0: tensor<13x21x3xui8>) -> tensor<*xui8> { - // CHECK: %[[MAX:.+]] = "tosa.const"() {value = dense<255> : tensor<1x1x1xui8>} + // CHECK: %[[MAX:.+]] = "tosa.const"() <{value = dense<255> : tensor<1x1x1xui8>}> // CHECK: %[[SUB:.+]] = "tosa.sub"(%[[MAX]], %arg0) - // CHECK: %[[ARGMAX:.+]] = "tosa.argmax"(%[[SUB]]) {axis = 1 : i64} + // CHECK: %[[ARGMAX:.+]] = "tosa.argmax"(%[[SUB]]) <{axis = 1 : i64}> %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %1 = "tfl.arg_min"(%arg0, %0) : (tensor<13x21x3xui8>, tensor) -> tensor<*xui8> func.return %1 : tensor<*xui8> @@ -2286,18 +2441,18 @@ func.func @test_arg_min_ui8(%arg0: tensor<13x21x3xui8>) -> tensor<*xui8> { // ----- // CHECK-LABEL: test_fakequant -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<-2.00003052> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<1.99996948> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() {value = dense<6.10360876E-5> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() {value = dense<16383.75> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<-2.00003052> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<1.99996948> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<6.10360876E-5> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = dense<16383.75> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{value = dense<5.000000e-01> : tensor<1x1x1xf32>}> // CHECK-DAG: %[[VAR6:.*]] = "tosa.minimum"(%arg0, %[[VAR1]]) // CHECK-DAG: %[[VAR8:.*]] = "tosa.maximum"(%[[VAR6]], %[[VAR0]]) // CHECK-DAG: %[[VAR10:.*]] = "tosa.sub"(%[[VAR8]], %[[VAR0]]) -// CHECK-DAG: %[[VAR12:.*]] = "tosa.mul"(%[[VAR10]], %[[VAR3]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR12:.*]] = "tosa.mul"(%[[VAR10]], %[[VAR3]]) <{shift = 0 : i32}> // CHECK-DAG: %[[VAR14:.*]] = "tosa.add"(%[[VAR12]], %[[VAR4]]) // CHECK-DAG: %[[VAR15:.*]] = "tosa.floor"(%[[VAR14]]) -// CHECK-DAG: %[[VAR17:.*]] = "tosa.mul"(%[[VAR15]], %[[VAR2]]) {shift = 0 : i32} +// CHECK-DAG: %[[VAR17:.*]] = "tosa.mul"(%[[VAR15]], %[[VAR2]]) <{shift = 0 : i32}> // CHECK: %[[VAR19:.*]] = "tosa.add"(%[[VAR17]], %[[VAR0]]) func.func @test_fakequant(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %2 = "tfl.fake_quant"(%arg0) {max = 2.000000e+00 : f32, min = -2.000000e+00 : f32, narrow_range = false, num_bits = 16 : i32} : (tensor<13x21x3xf32>) -> tensor<*xf32> @@ -2307,14 +2462,12 @@ func.func @test_fakequant(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: @test_fullyconnected_hybrid -func.func @test_fullyconnected_hybrid(%arg0: tensor<14x19xf32>) -> tensor<*xf32> { +func.func @test_fullyconnected_hybrid(%arg0: tensor<14x19xf32>, %arg1: tensor<28x19x!quant.uniform>, %arg2: tensor<28xf32>) -> tensor<*xf32> { // This verifies that the constant is decomposed into a dequantization via a // cast, subtract, and multiplication. 
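// That is, the hybrid (float activation, quantized weight) path is expected to
// materialize roughly float_weight = (cast(quant_weight) - zero_point) * scale
// before the float fully_connected, hence the cast/sub/mul sequence checked below.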
// CHECK: "tosa.sub" // CHECK: "tosa.fully_connected" - %0 = "tfl.pseudo_qconst"() {qtype = tensor<36x36x!quant.uniform>, value = dense<42> : tensor<28x19xi8>} : () -> tensor<28x19x!quant.uniform> - %1 = "tfl.pseudo_const"() {value = dense<0.0> : tensor<28xf32>} : () -> tensor<28xf32> - %2 = "tfl.fully_connected"(%arg0, %0, %1) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<14x19xf32>, tensor<28x19x!quant.uniform>, tensor<28xf32>) -> tensor<*xf32> + %2 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<14x19xf32>, tensor<28x19x!quant.uniform>, tensor<28xf32>) -> tensor<*xf32> func.return %2 : tensor<*xf32> } @@ -2355,19 +2508,19 @@ func.func @test_squeeze_neg(%arg0: tensor<2x1x3x1xf32>) -> tensor<2x1x3xf32> { // CHECK-LABEL: test_gelu // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4x8x19xf32> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() {value = dense<3.000000e+00> : tensor<1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() {value = dense<4.471500e-02> : tensor<1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() {value = dense<0.797884583> : tensor<1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() {value = dense<1.000000e+00> : tensor<1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() {value = dense<5.000000e-01> : tensor<1x1x1x1xf32>} +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<4.471500e-02> : tensor<1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<0.797884583> : tensor<1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<5.000000e-01> : tensor<1x1x1x1xf32>}> // CHECK: %[[VAL_6:.*]] = "tosa.pow"(%[[VAL_0]], %[[VAL_1]]) -// CHECK: %[[VAL_7:.*]] = "tosa.mul"(%[[VAL_6]], %[[VAL_2]]) {shift = 0 : i32} +// CHECK: %[[VAL_7:.*]] = "tosa.mul"(%[[VAL_6]], %[[VAL_2]]) <{shift = 0 : i32}> // CHECK: %[[VAL_8:.*]] = "tosa.add"(%[[VAL_0]], %[[VAL_7]]) -// CHECK: %[[VAL_9:.*]] = "tosa.mul"(%[[VAL_8]], %[[VAL_3]]) {shift = 0 : i32} +// CHECK: %[[VAL_9:.*]] = "tosa.mul"(%[[VAL_8]], %[[VAL_3]]) <{shift = 0 : i32}> // CHECK: %[[VAL_10:.*]] = "tosa.tanh"(%[[VAL_9]]) // CHECK: %[[VAL_11:.*]] = "tosa.add"(%[[VAL_10]], %[[VAL_4]]) -// CHECK: %[[VAL_12:.*]] = "tosa.mul"(%[[VAL_0]], %[[VAL_5]]) {shift = 0 : i32} -// CHECK: %[[VAL_13:.*]] = "tosa.mul"(%[[VAL_12]], %[[VAL_11]]) {shift = 0 : i32} +// CHECK: %[[VAL_12:.*]] = "tosa.mul"(%[[VAL_0]], %[[VAL_5]]) <{shift = 0 : i32}> +// CHECK: %[[VAL_13:.*]] = "tosa.mul"(%[[VAL_12]], %[[VAL_11]]) <{shift = 0 : i32}> func.func @test_gelu(%arg0: tensor<1x4x8x19xf32>) -> tensor<1x4x8x19xf32> { %0 = "tfl.gelu"(%arg0) {approximate = true} : (tensor<1x4x8x19xf32>) -> tensor<1x4x8x19xf32> func.return %0 : tensor<1x4x8x19xf32> @@ -2377,8 +2530,8 @@ func.func @test_gelu(%arg0: tensor<1x4x8x19xf32>) -> tensor<1x4x8x19xf32> { // CHECK-LABEL: test_gelu_qi8 // CHECK-SAME: %[[VAR0:.*]]: tensor<1x4x4x4x!quant.uniform> -// CHECK: %[[VAR1:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<256xi8>} -// CHECK: %[[VAR2:.*]] = "tosa.table"(%[[VAR0]], %[[VAR1]]) : (tensor<1x4x4x4x!quant.uniform>, tensor<256xi8>) +// CHECK: %[[VAR1:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<256xi8>}> +// CHECK: %[[VAR2:.*]] = "tosa.table"(%[[VAR0]], %[[VAR1]]) : 
(tensor<1x4x4x4x!quant.uniform>, tensor<256x!quant.uniform>) func.func @test_gelu_qi8(%arg0: tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> { %0 = "tfl.gelu"(%arg0) {approximate = true} : (tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> func.return %0 : tensor<1x4x4x4x!quant.uniform> @@ -2388,14 +2541,14 @@ func.func @test_gelu_qi8(%arg0: tensor<1x4x4x4x!quant.uniform> -// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) {size = array, start = array} : (tensor<4x9x!quant.uniform>) -// CHECK: %[[VAL_2:.*]] = "tosa.reverse"(%[[VAL_1]]) {axis = 0 : i64} : (tensor<2x9x!quant.uniform>) -// CHECK: %[[VAL_3:.*]] = "tosa.slice"(%[[VAL_0]]) {size = array, start = array} : (tensor<4x9x!quant.uniform>) -// CHECK: %[[VAL_4:.*]] = "tosa.concat"(%[[VAL_2]], %[[VAL_0]], %[[VAL_3]]) {axis = 0 : i64} : (tensor<2x9x!quant.uniform>, tensor<4x9x!quant.uniform>, tensor<1x9x!quant.uniform>) -// CHECK: %[[VAL_5:.*]] = "tosa.slice"(%[[VAL_4]]) {size = array, start = array} : (tensor<7x9x!quant.uniform>) -// CHECK: %[[VAL_6:.*]] = "tosa.reverse"(%[[VAL_5]]) {axis = 1 : i64} : (tensor<7x2x!quant.uniform>) -// CHECK: %[[VAL_7:.*]] = "tosa.slice"(%[[VAL_4]]) {size = array, start = array} : (tensor<7x9x!quant.uniform>) -// CHECK: %[[VAL_8:.*]] = "tosa.concat"(%[[VAL_6]], %[[VAL_4]], %[[VAL_7]]) {axis = 1 : i64} : (tensor<7x2x!quant.uniform>, tensor<7x9x!quant.uniform>, tensor<7x1x!quant.uniform>) +// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) <{size = array, start = array}> : (tensor<4x9x!quant.uniform>) +// CHECK: %[[VAL_2:.*]] = "tosa.reverse"(%[[VAL_1]]) <{axis = 0 : i64}> : (tensor<2x9x!quant.uniform>) +// CHECK: %[[VAL_3:.*]] = "tosa.slice"(%[[VAL_0]]) <{size = array, start = array}> : (tensor<4x9x!quant.uniform>) +// CHECK: %[[VAL_4:.*]] = "tosa.concat"(%[[VAL_2]], %[[VAL_0]], %[[VAL_3]]) <{axis = 0 : i64}> : (tensor<2x9x!quant.uniform>, tensor<4x9x!quant.uniform>, tensor<1x9x!quant.uniform>) +// CHECK: %[[VAL_5:.*]] = "tosa.slice"(%[[VAL_4]]) <{size = array, start = array}> : (tensor<7x9x!quant.uniform>) +// CHECK: %[[VAL_6:.*]] = "tosa.reverse"(%[[VAL_5]]) <{axis = 1 : i64}> : (tensor<7x2x!quant.uniform>) +// CHECK: %[[VAL_7:.*]] = "tosa.slice"(%[[VAL_4]]) <{size = array, start = array}> : (tensor<7x9x!quant.uniform>) +// CHECK: %[[VAL_8:.*]] = "tosa.concat"(%[[VAL_6]], %[[VAL_4]], %[[VAL_7]]) <{axis = 1 : i64}> : (tensor<7x2x!quant.uniform>, tensor<7x9x!quant.uniform>, tensor<7x1x!quant.uniform>) func.func @mirrorpad_reflect(%arg0: tensor<4x9x!quant.uniform>) -> tensor<7x12x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<[[2, 1], [2, 1]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32> %1 = "tfl.mirror_pad"(%arg0, %0) {mode = #tfl} : (tensor<4x9x!quant.uniform>, tensor<2x2xi32>) -> tensor<7x12x!quant.uniform> @@ -2406,12 +2559,12 @@ func.func @mirrorpad_reflect(%arg0: tensor<4x9x!quant.uniform -// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) {size = array, start = array} : (tensor<15x23x2xf32>) -// CHECK: %[[VAL_2:.*]] = "tosa.concat"(%[[VAL_1]], %[[VAL_0]]) {axis = 0 : i64} : (tensor<1x23x2xf32>, tensor<15x23x2xf32>) -// CHECK: %[[VAL_3:.*]] = "tosa.slice"(%[[VAL_2]]) {size = array, start = array} : (tensor<16x23x2xf32>) -// CHECK: %[[VAL_4:.*]] = "tosa.concat"(%[[VAL_3]], %[[VAL_2]]) {axis = 1 : i64} : (tensor<16x1x2xf32>, tensor<16x23x2xf32>) -// CHECK: %[[VAL_5:.*]] = "tosa.slice"(%[[VAL_4]]) {size = array, start = array} : (tensor<16x24x2xf32>) -// CHECK: %[[VAL_6:.*]] = "tosa.concat"(%[[VAL_5]], %[[VAL_4]]) {axis = 2 : i64} : (tensor<16x24x1xf32>, 
tensor<16x24x2xf32>) +// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) <{size = array, start = array}> : (tensor<15x23x2xf32>) +// CHECK: %[[VAL_2:.*]] = "tosa.concat"(%[[VAL_1]], %[[VAL_0]]) <{axis = 0 : i64}> : (tensor<1x23x2xf32>, tensor<15x23x2xf32>) +// CHECK: %[[VAL_3:.*]] = "tosa.slice"(%[[VAL_2]]) <{size = array, start = array}> : (tensor<16x23x2xf32>) +// CHECK: %[[VAL_4:.*]] = "tosa.concat"(%[[VAL_3]], %[[VAL_2]]) <{axis = 1 : i64}> : (tensor<16x1x2xf32>, tensor<16x23x2xf32>) +// CHECK: %[[VAL_5:.*]] = "tosa.slice"(%[[VAL_4]]) <{size = array, start = array}> : (tensor<16x24x2xf32>) +// CHECK: %[[VAL_6:.*]] = "tosa.concat"(%[[VAL_5]], %[[VAL_4]]) <{axis = 2 : i64}> : (tensor<16x24x1xf32>, tensor<16x24x2xf32>) func.func @mirrorpad_symmetric(%arg0: tensor<15x23x2xf32>) -> tensor<16x24x3xf32> { %0 = "tfl.pseudo_const"() {value = dense<[[1, 0], [1, 0], [1, 0]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32> %1 = "tfl.mirror_pad"(%arg0, %0) {mode = #tfl} : (tensor<15x23x2xf32>, tensor<3x2xi32>) -> tensor<16x24x3xf32> @@ -2422,8 +2575,8 @@ func.func @mirrorpad_symmetric(%arg0: tensor<15x23x2xf32>) -> tensor<16x24x3xf32 // CHECK-LABEL: @test_reverse_works func.func @test_reverse_works(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { - // CHECK: %[[VAL0:.+]] = "tosa.reverse"(%arg0) {axis = 1 : i64} - // CHECK: %[[VAL1:.+]] = "tosa.reverse"(%[[VAL0]]) {axis = 2 : i64} + // CHECK: %[[VAL0:.+]] = "tosa.reverse"(%arg0) <{axis = 1 : i64}> + // CHECK: %[[VAL1:.+]] = "tosa.reverse"(%[[VAL0]]) <{axis = 2 : i64}> %0 = "tfl.pseudo_const"() {value = dense<[1, -2]> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.reverse_v2"(%arg0, %0): (tensor<1x2x3x4xf32>, tensor<2xi32>) -> tensor<1x2x3x4xf32> func.return %1 : tensor<1x2x3x4xf32> @@ -2443,7 +2596,7 @@ func.func @test_reverse_fail(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> // CHECK-LABEL: test_tfl_custom // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x64x64x32xf32> -// CHECK: %[[VAL_0:.*]] = "tosa.custom"(%[[ARG_0]]) {config = "TFL", identifier = "MaxPoolingWithArgmax2D", implementation_attrs = "{{.*}}"} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) +// CHECK: %[[VAL_0:.*]] = "tosa.custom"(%[[ARG_0]]) <{config = "TFL", identifier = "MaxPoolingWithArgmax2D", implementation_attrs = "{{.*}}"}> : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) func.func @test_tfl_custom(%arg0: tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) { // custom op for "tfl.max_pooling_with_argmax_2d"(%arg0) {filter_h = 2 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) %0, %1 = "tfl.custom"(%arg0) {custom_option = #tfl, custom_code = "MaxPoolingWithArgmax2D"} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) @@ -2453,15 +2606,15 @@ func.func @test_tfl_custom(%arg0: tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32x // ----- // CHECK-LABEL: test_tfl_while_loop // CHECK: %[[VAL_0:.*]]: tensor<1x4x4x4xf32> {tf_saved_model.index_path = ["placeholder_0"]}) -> (tensor<1x4x4x4xf32> {tf_saved_model.index_path = ["output_0"]}) { -// CHECK: %[[VAL_1:.*]] = "tosa.const"() {value = dense<2.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<2.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: %[[VAL_2:.*]] = "tosa.while_loop"(%[[VAL_0]]) ({ // CHECK: ^bb0(%[[VAL_3:.*]]: tensor<1x4x4x4xf32>): -// 
CHECK: %[[VAL_4:.*]] = "tosa.reduce_sum"(%[[VAL_3]]) {axis = 1 : i64} : (tensor<1x4x4x4xf32>) -> tensor<1x1x4x4xf32> -// CHECK: %[[VAL_5:.*]] = "tosa.reduce_sum"(%[[VAL_4]]) {axis = 2 : i64} : (tensor<1x1x4x4xf32>) -> tensor<1x1x1x4xf32> -// CHECK: %[[VAL_6:.*]] = "tosa.reduce_sum"(%[[VAL_5]]) {axis = 3 : i64} : (tensor<1x1x1x4xf32>) -> tensor<1x1x1x1xf32> -// CHECK: %[[VAL_7:.*]] = "tosa.reshape"(%[[VAL_6]]) {new_shape = array} : (tensor<1x1x1x1xf32>) -> tensor<1xf32> +// CHECK: %[[VAL_4:.*]] = "tosa.reduce_sum"(%[[VAL_3]]) <{axis = 1 : i64}> : (tensor<1x4x4x4xf32>) -> tensor<1x1x4x4xf32> +// CHECK: %[[VAL_5:.*]] = "tosa.reduce_sum"(%[[VAL_4]]) <{axis = 2 : i64}> : (tensor<1x1x4x4xf32>) -> tensor<1x1x1x4xf32> +// CHECK: %[[VAL_6:.*]] = "tosa.reduce_sum"(%[[VAL_5]]) <{axis = 3 : i64}> : (tensor<1x1x1x4xf32>) -> tensor<1x1x1x1xf32> +// CHECK: %[[VAL_7:.*]] = "tosa.reshape"(%[[VAL_6]]) <{new_shape = array}> : (tensor<1x1x1x1xf32>) -> tensor<1xf32> // CHECK: %[[VAL_8:.*]] = "tosa.greater"(%[[VAL_1]], %[[VAL_7]]) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xi1> -// CHECK: %[[VAL_9:.*]] = "tosa.reshape"(%[[VAL_8]]) {new_shape = array} : (tensor<1xi1>) -> tensor +// CHECK: %[[VAL_9:.*]] = "tosa.reshape"(%[[VAL_8]]) <{new_shape = array}> : (tensor<1xi1>) -> tensor // CHECK: "tosa.yield"(%[[VAL_9]]) : (tensor) -> () // CHECK: }, { // CHECK: ^bb0(%[[VAL_10:.*]]: tensor<1x4x4x4xf32>): @@ -2500,10 +2653,82 @@ func.func private @result_body(%arg0: tensor<1x4x4x4xf32>) -> tensor<1x4x4x4xf32 // ----- +// CHECK-LABEL: test_rfft2d +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x16xf32> +// CHECK: %[[VAL_1:.*]], %[[VAL_2:.*]] = "tosa.rfft2d"(%[[VAL_0]]) : (tensor<1x8x16xf32>) -> (tensor<1x8x9xf32>, tensor<1x8x9xf32>) +// CHECK: %[[VAL_3:.*]] = "tosa.reshape"(%[[VAL_1]]) <{new_shape = array}> : (tensor<1x8x9xf32>) -> tensor<1x8x9x1xf32> +// CHECK: %[[VAL_4:.*]] = "tosa.reshape"(%[[VAL_2]]) <{new_shape = array}> : (tensor<1x8x9xf32>) -> tensor<1x8x9x1xf32> +// CHECK: %[[VAL_5:.*]] = "tosa.concat"(%[[VAL_3]], %[[VAL_4]]) <{axis = 3 : i64}> : (tensor<1x8x9x1xf32>, tensor<1x8x9x1xf32>) -> tensor<1x8x9x2xf32> +// CHECK: return %[[VAL_5]] : tensor<1x8x9x2xf32> +func.func @test_rfft2d(%arg0: tensor<1x8x16xf32>) -> tensor<1x8x9xcomplex> { + %0 = "tfl.pseudo_const"() {value = dense<[8, 16]> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tfl.rfft2d"(%arg0, %0) : (tensor<1x8x16xf32>, tensor<2xi32>) -> tensor<1x8x9xcomplex> + return %1 : tensor<1x8x9xcomplex> +} + +// ----- + +// CHECK-LABEL: test_rfft2d_crop_input +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xf32> +// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) <{size = array, start = array}> : (tensor<13x21x3xf32>) -> tensor<13x2x2xf32> +// CHECK: %[[VAL_2:.*]], %[[VAL_3:.*]] = "tosa.rfft2d"(%[[VAL_1]]) : (tensor<13x2x2xf32>) -> (tensor<13x2x2xf32>, tensor<13x2x2xf32>) +// CHECK: %[[VAL_4:.*]] = "tosa.reshape"(%[[VAL_2]]) <{new_shape = array}> : (tensor<13x2x2xf32>) -> tensor<13x2x2x1xf32> +// CHECK: %[[VAL_5:.*]] = "tosa.reshape"(%[[VAL_3]]) <{new_shape = array}> : (tensor<13x2x2xf32>) -> tensor<13x2x2x1xf32> +// CHECK: %[[VAL_6:.*]] = "tosa.concat"(%[[VAL_4]], %[[VAL_5]]) <{axis = 3 : i64}> : (tensor<13x2x2x1xf32>, tensor<13x2x2x1xf32>) -> tensor<13x2x2x2xf32> +// CHECK: return %[[VAL_6]] : tensor<13x2x2x2xf32> +func.func @test_rfft2d_crop_input(%arg0: tensor<13x21x3xf32>) -> tensor<13x2x2xcomplex> { + %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tfl.rfft2d"(%arg0, %0) : (tensor<13x21x3xf32>, tensor<2xi32>) -> 
tensor<13x2x2xcomplex> + return %1 : tensor<13x2x2xcomplex> +} + +// ----- + +// CHECK-LABEL: test_rfft2d_pad_input +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [0, 11], [0, 5]]> : tensor<3x2xi32>}> : () -> tensor<3x2xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.pad"(%[[VAL_0]], %[[VAL_2]], %[[VAL_1]]) : (tensor<13x21x3xf32>, tensor<3x2xi32>, tensor) -> tensor<13x32x8xf32> +// CHECK: %[[VAL_4:.*]], %[[VAL_5:.*]] = "tosa.rfft2d"(%[[VAL_3]]) : (tensor<13x32x8xf32>) -> (tensor<13x32x5xf32>, tensor<13x32x5xf32>) +// CHECK: %[[VAL_6:.*]] = "tosa.reshape"(%[[VAL_4]]) <{new_shape = array}> : (tensor<13x32x5xf32>) -> tensor<13x32x5x1xf32> +// CHECK: %[[VAL_7:.*]] = "tosa.reshape"(%[[VAL_5]]) <{new_shape = array}> : (tensor<13x32x5xf32>) -> tensor<13x32x5x1xf32> +// CHECK: %[[VAL_8:.*]] = "tosa.concat"(%[[VAL_6]], %[[VAL_7]]) <{axis = 3 : i64}> : (tensor<13x32x5x1xf32>, tensor<13x32x5x1xf32>) -> tensor<13x32x5x2xf32> +// CHECK: return %[[VAL_8]] : tensor<13x32x5x2xf32> +func.func @test_rfft2d_pad_input(%arg0: tensor<13x21x3xf32>) -> (tensor<13x32x5xcomplex>) { + %0 = "tfl.pseudo_const"() {value = dense<[[0, 0], [0, 11], [0, 5]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32> + %1 = "tfl.pad"(%arg0, %0) : (tensor<13x21x3xf32>, tensor<3x2xi32>) -> tensor<13x32x8xf32> + %2 = "tfl.pseudo_const"() {value = dense<[32, 8]> : tensor<2xi32>} : () -> tensor<2xi32> + %3 = "tfl.rfft2d"(%1, %2) : (tensor<13x32x8xf32>, tensor<2xi32>) -> tensor<13x32x5xcomplex> + return %3 : tensor<13x32x5xcomplex> +} + +// ----- + +// CHECK-LABEL: test_rfft2d_crop_height_pad_width +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [0, 0], [0, 13]]> : tensor<3x2xi32>}> : () -> tensor<3x2xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.pad"(%[[VAL_0]], %[[VAL_2]], %[[VAL_1]]) : (tensor<13x21x3xf32>, tensor<3x2xi32>, tensor) -> tensor<13x21x16xf32> +// CHECK: %[[VAL_4:.*]] = "tosa.slice"(%[[VAL_3]]) <{size = array, start = array}> : (tensor<13x21x16xf32>) -> tensor<13x2x16xf32> +// CHECK: %[[VAL_5:.*]], %[[VAL_6:.*]] = "tosa.rfft2d"(%[[VAL_4]]) : (tensor<13x2x16xf32>) -> (tensor<13x2x9xf32>, tensor<13x2x9xf32>) +// CHECK: %[[VAL_7:.*]] = "tosa.reshape"(%[[VAL_5]]) <{new_shape = array}> : (tensor<13x2x9xf32>) -> tensor<13x2x9x1xf32> +// CHECK: %[[VAL_8:.*]] = "tosa.reshape"(%[[VAL_6]]) <{new_shape = array}> : (tensor<13x2x9xf32>) -> tensor<13x2x9x1xf32> +// CHECK: %[[VAL_9:.*]] = "tosa.concat"(%[[VAL_7]], %[[VAL_8]]) <{axis = 3 : i64}> : (tensor<13x2x9x1xf32>, tensor<13x2x9x1xf32>) -> tensor<13x2x9x2xf32> +// CHECK: return %[[VAL_9]] : tensor<13x2x9x2xf32> +func.func @test_rfft2d_crop_height_pad_width(%arg0: tensor<13x21x3xf32>) -> (tensor<13x2x9xcomplex>) { + %0 = "tfl.pseudo_const"() {value = dense<[[0, 0], [0, 0], [0, 13]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32> + %1 = "tfl.pad"(%arg0, %0) : (tensor<13x21x3xf32>, tensor<3x2xi32>) -> tensor<13x21x16xf32> + %2 = "tfl.pseudo_const"() {value = dense<[2, 16]> : tensor<2xi32>} : () -> tensor<2xi32> + %3 = "tfl.rfft2d"(%1, %2) : (tensor<13x21x16xf32>, tensor<2xi32>) -> tensor<13x2x9xcomplex> + return %3 : tensor<13x2x9xcomplex> +} + +// ----- + // CHECK-LABEL: test_real // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x9x2xf32> -// CHECK: %[[VAL_1:.*]] = 
"tosa.slice"(%[[VAL_0]]) {size = array, start = array} : (tensor<1x8x9x2xf32>) -> tensor<1x8x9x1xf32> -// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%[[VAL_1]]) {new_shape = array} : (tensor<1x8x9x1xf32>) -> tensor<1x8x9xf32> +// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) <{size = array, start = array}> : (tensor<1x8x9x2xf32>) -> tensor<1x8x9x1xf32> +// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%[[VAL_1]]) <{new_shape = array}> : (tensor<1x8x9x1xf32>) -> tensor<1x8x9xf32> // CHECK: return %[[VAL_2]] : tensor<1x8x9xf32> func.func @test_real(%arg0: tensor<1x8x9xcomplex>) -> (tensor<1x8x9xf32>) { %0 = "tfl.real"(%arg0) {} : (tensor<1x8x9xcomplex>) -> tensor<1x8x9xf32> @@ -2525,8 +2750,8 @@ func.func @test_real_non_complex(%arg0: tensor<1x8x9xf32>) -> (tensor<1x8x9xf32> // CHECK-LABEL: test_imag // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x9x2xf32> -// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) {size = array, start = array} : (tensor<1x8x9x2xf32>) -> tensor<1x8x9x1xf32> -// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%[[VAL_1]]) {new_shape = array} : (tensor<1x8x9x1xf32>) -> tensor<1x8x9xf32> +// CHECK: %[[VAL_1:.*]] = "tosa.slice"(%[[VAL_0]]) <{size = array, start = array}> : (tensor<1x8x9x2xf32>) -> tensor<1x8x9x1xf32> +// CHECK: %[[VAL_2:.*]] = "tosa.reshape"(%[[VAL_1]]) <{new_shape = array}> : (tensor<1x8x9x1xf32>) -> tensor<1x8x9xf32> // CHECK: return %[[VAL_2]] : tensor<1x8x9xf32> func.func @test_imag(%arg0: tensor<1x8x9xcomplex>) -> (tensor<1x8x9xf32>) { %0 = "tfl.imag"(%arg0) {} : (tensor<1x8x9xcomplex>) -> tensor<1x8x9xf32> @@ -2537,7 +2762,7 @@ func.func @test_imag(%arg0: tensor<1x8x9xcomplex>) -> (tensor<1x8x9xf32>) { // CHECK-LABEL: test_imag_non_complex // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x9xf32> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<1x8x9xf32>} : () -> tensor<1x8x9xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x8x9xf32>}> : () -> tensor<1x8x9xf32> // CHECK: return %[[VAL_1]] : tensor<1x8x9xf32> func.func @test_imag_non_complex(%arg0: tensor<1x8x9xf32>) -> (tensor<1x8x9xf32>) { %0 = "tfl.imag"(%arg0) {} : (tensor<1x8x9xf32>) -> tensor<1x8x9xf32> diff --git a/tensorflow/compiler/mlir/tosa/tests/verify_fully_converted.mlir b/tensorflow/compiler/mlir/tosa/tests/verify_fully_converted.mlir new file mode 100644 index 00000000000..3783c379908 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/tests/verify_fully_converted.mlir @@ -0,0 +1,19 @@ +// RUN: tf-opt %s --tosa-tflite-verify-fully-converted --split-input-file -verify-diagnostics + +// CHECK-LABEL: func.func @main +func.func @main(%arg0: tensor<2xf32>) -> (tensor<2xf32>) { + // CHECK: "tosa.add" + %0 = "tosa.add"(%arg0, %arg0) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +// expected-error@below {{The following illegal operations still remain}} +func.func @main(%arg0: tensor<1x8x8x3xf32>) -> tensor<1x8x8x3xf32> attributes {tf.entry_function = {inputs = "input", outputs = "output"}} { + // expected-error@+1 {{'tfl.add' op : illegal op still exists}} + %0 = tfl.add %arg0, %arg0 {fused_activation_function = "NONE"} : tensor<1x8x8x3xf32> + // expected-error@+1 {{'tfl.sub' op : illegal op still exists}} + %1 = tfl.sub %0, %arg0 {fused_activation_function = "NONE"} : tensor<1x8x8x3xf32> + return %1 : tensor<1x8x8x3xf32> +} diff --git a/tensorflow/compiler/mlir/tosa/tf_passes.cc b/tensorflow/compiler/mlir/tosa/tf_passes.cc index caedab20ccf..f1e7191e2ca 100644 --- 
a/tensorflow/compiler/mlir/tosa/tf_passes.cc +++ b/tensorflow/compiler/mlir/tosa/tf_passes.cc @@ -36,8 +36,8 @@ void createTFtoTOSALegalizationPipeline( pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::createCSEPass()); - pm.addPass(mlir::createLoopFusionPass()); - pm.addPass(mlir::createAffineScalarReplacementPass()); + pm.addPass(mlir::affine::createLoopFusionPass()); + pm.addPass(mlir::affine::createAffineScalarReplacementPass()); //---------------------------------------------------------------------------- // Perform main conversion. diff --git a/tensorflow/compiler/mlir/tosa/tf_tfl_passes.cc b/tensorflow/compiler/mlir/tosa/tf_tfl_passes.cc index 98cd3514561..2b31e3246fd 100644 --- a/tensorflow/compiler/mlir/tosa/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/tosa/tf_tfl_passes.cc @@ -40,8 +40,8 @@ void createTFTFLtoTOSALegalizationPipeline( pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::createCSEPass()); - pm.addPass(mlir::createLoopFusionPass()); - pm.addPass(mlir::createAffineScalarReplacementPass()); + pm.addPass(mlir::affine::createLoopFusionPass()); + pm.addPass(mlir::affine::createAffineScalarReplacementPass()); //---------------------------------------------------------------------------- // Perform main conversion. diff --git a/tensorflow/compiler/mlir/tosa/tfl_passes.cc b/tensorflow/compiler/mlir/tosa/tfl_passes.cc index 9f352f9c4a3..ff3c38e381e 100644 --- a/tensorflow/compiler/mlir/tosa/tfl_passes.cc +++ b/tensorflow/compiler/mlir/tosa/tfl_passes.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tosa/tfl_passes.h" +#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" // from @llvm-project #include "mlir/Dialect/Affine/Passes.h" // from @llvm-project #include "mlir/Dialect/Tosa/Transforms/Passes.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project @@ -29,8 +30,17 @@ void createTFLtoTOSALegalizationPipeline( //---------------------------------------------------------------------------- // Prepare TFL module for conversion //---------------------------------------------------------------------------- + if (opts.target_compilation_backend) { + pm.addPass(createRetainCallOnceFuncsPass()); + } // Inline all functions into main and then delete the functions themselves. pm.addPass(mlir::createInlinerPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createSymbolDCEPass()); + if (opts.target_compilation_backend) { + pm.nest().addPass(createConvertFunctionMetadataPass()); + pm.addPass(createLowerGlobalTensorsPass()); + } // Add pass to decompose TFLite mixed quantization to non-quantized variants. pm.addPass(TFL::CreateDecomposeHybridQuantizationPass()); @@ -39,8 +49,8 @@ void createTFLtoTOSALegalizationPipeline( pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::createCSEPass()); - pm.addPass(mlir::createLoopFusionPass()); - pm.addPass(mlir::createAffineScalarReplacementPass()); + pm.addPass(mlir::affine::createLoopFusionPass()); + pm.addPass(mlir::affine::createAffineScalarReplacementPass()); //---------------------------------------------------------------------------- // Perform main conversion. @@ -62,6 +72,15 @@ void createTFLtoTOSALegalizationPipeline( pm.addPass(mlir::createInlinerPass()); // Clean up with DCE. 
pm.addPass(mlir::createSymbolDCEPass()); + + if (opts.target_compilation_backend) { + pm.nest().addPass(mlir::tosa::createStripQuantTypesPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createReconcileUnrealizedCastsPass()); + pm.nest().addPass(createStripFunctionMetadataPass()); + pm.addPass(createStripModuleMetadataPass()); + pm.addPass(createVerifyFullyConvertedPass()); + } } void registerTFLtoTOSALegalizationPipeline() { diff --git a/tensorflow/compiler/mlir/tosa/tfl_passes.h b/tensorflow/compiler/mlir/tosa/tfl_passes.h index 1d73a655ce0..228b9ec2691 100644 --- a/tensorflow/compiler/mlir/tosa/tfl_passes.h +++ b/tensorflow/compiler/mlir/tosa/tfl_passes.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "llvm/Support/CommandLine.h" #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Pass/PassOptions.h" // from @llvm-project @@ -29,7 +30,16 @@ struct TOSATFLLegalizationPipelineOptions : public PassPipelineOptions { ArrayRef disabled_patterns; ArrayRef enabled_patterns; - bool dequantize_tfl_softmax = false; + + PassOptions::Option target_compilation_backend{ + *this, "target-compilation-backend", + llvm::cl::desc("Whether targetting compilation backend"), + llvm::cl::init(false)}; + + PassOptions::Option dequantize_tfl_softmax{ + *this, "dequantize-tfl-softmax", + llvm::cl::desc("Dequantize the TFLite softmax"), llvm::cl::init(false)}; + TOSATFLLegalizationPipelineOptions() { disabled_patterns = std::nullopt; enabled_patterns = std::nullopt; diff --git a/tensorflow/compiler/mlir/tosa/transforms/convert_metadata.cc b/tensorflow/compiler/mlir/tosa/transforms/convert_metadata.cc new file mode 100644 index 00000000000..f81aee69f55 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/transforms/convert_metadata.cc @@ -0,0 +1,114 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/ADT/StringExtras.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir::tosa { + +#define GEN_PASS_DEF_CONVERTFUNCTIONMETADATA +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" + +namespace { + +// Extract the input and output names +static void splitFunctionIONames(StringAttr namesAttr, + llvm::SmallVectorImpl &names) { + SmallVector namesRef; + llvm::SplitString(namesAttr.getValue(), namesRef, ","); + for (auto nameRef : namesRef) { + names.push_back(nameRef.str()); + } +} + +class ConvertFunctionMetadataPass + : public impl::ConvertFunctionMetadataBase { + public: + void runOnOperation() override { + auto funcOp = getOperation(); + + // Setup entry functions for compilation and preserve the + // associated metadata. Note that TFLite uses `tf.entry_function`. 
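+    // The attribute carries comma-separated "inputs"/"outputs" name lists;
+    // they are split below and re-attached as per-argument and per-result
+    // ml_program.identifier attributes so a compilation backend can still
+    // recover the original I/O names once the TF/TFL metadata is stripped.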
+ auto entryFunctionAttr = + funcOp->getAttrOfType("tf.entry_function"); + if (entryFunctionAttr) { + setupEntryPointAttrs(funcOp, entryFunctionAttr); + } + } + + private: + // TF/TFL pack their I/O names in a dictionary, convert into arg attributes. + void setupEntryPointAttrs(func::FuncOp funcOp, + DictionaryAttr entryFunctionAttr) { + funcOp.setPublic(); + + if (funcOp.getNumArguments() > 0) { + auto inputsAttr = + dyn_cast_or_null(entryFunctionAttr.get("inputs")); + if (!inputsAttr) { + funcOp.emitError() << "functions with tf.entry_function must have " + "input names to be handled by backend"; + return signalPassFailure(); + } + SmallVector inputNames; + splitFunctionIONames(inputsAttr, inputNames); + if (inputNames.size() != funcOp.getNumArguments()) { + funcOp.emitError() + << "tf.entry_function attribute malformed: inputs don't " + "match the function signature"; + return signalPassFailure(); + } + for (auto [i, name] : llvm::enumerate(inputNames)) { + funcOp.setArgAttr(i, "ml_program.identifier", + StringAttr::get(&getContext(), name)); + } + } + if (funcOp.getNumResults() > 0) { + auto outputsAttr = + dyn_cast_or_null(entryFunctionAttr.get("outputs")); + if (!outputsAttr) { + funcOp.emitError() << "functions with tf.entry_function must have " + "output names to be handled by backend"; + return signalPassFailure(); + } + SmallVector outputNames; + splitFunctionIONames(outputsAttr, outputNames); + if (outputNames.size() != funcOp.getNumResults()) { + funcOp.emitError() + << "tf.entry_function attribute malformed: outputs don't " + "match the function signature"; + return signalPassFailure(); + } + for (auto [i, name] : llvm::enumerate(outputNames)) { + funcOp.setResultAttr(i, "ml_program.identifier", + StringAttr::get(&getContext(), name)); + } + } + } +}; +} // anonymous namespace + +std::unique_ptr> +createConvertFunctionMetadataPass() { + return std::make_unique(); +} + +} // namespace mlir::tosa diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc index 4e8cd06c2bf..54539429695 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc @@ -36,6 +36,7 @@ limitations under the License. #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/ImplicitLocOpBuilder.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" @@ -1396,9 +1397,11 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, // softmax = exp(logits - max(logits)) / reduce_sum(exp(logits - // max(logits)), -1) // - // We'll use first version for direct fp lowering, and second version for - // quantized lowering since second one we can restrict input to exp() be - // negative, and thus LUT can always be within [0.0, 1.0]. + // Second equation is used for both quantized and fp lowering. + // For quantized case, we can restrict input to exp() be negative, + // and thus LUT can always be within [0.0, 1.0]. + // For fp case, the normalization in the equation is required to prevent + // float overflow in softmax's intermediate calculations. 
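+  // For example, exp(100.0f) already exceeds the fp32 maximum (~3.4e38),
+  // whereas exp(logits - max(logits)) always lies in (0.0, 1.0]; the common
+  // factor exp(max) cancels between numerator and denominator, so the
+  // softmax result is unchanged.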
RankedTensorType output_type = result_value.getType().dyn_cast(); RankedTensorType input_type = @@ -1777,28 +1780,42 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, rsum_shape_v[input_rank - 1] = 1; ArrayRef rsum_shape(rsum_shape_v); - // Floating-point loewring is more direct: + // Floating-point lowering is more direct: // - // op1 = exp(logits) - // op2 = reduce_sum(op1, -1) - // op3 = reciprocal(op2) - // op4 = mul(op1, op3) - auto op1_exp_in = CreateOpAndInfer(rewriter, op->getLoc(), - output_type, logits_value); + // op1 = reducemax(logits) + // op2 = sub(logits, op1) + // op3 = exp(op2) + // op4 = reduce_sum(op3, -1) + // op5 = reciprocal(op4) + // op6 = mul(op3, op5) RankedTensorType rsum_type = tensorflow::GetTypeFromTFTensorShape( rsum_shape, output_type.getElementType()); + RankedTensorType logits_type = tensorflow::GetTypeFromTFTensorShape( + logits_shape, output_type.getElementType()); - // Keep dims so we don't need to reshape later - auto op2_reducesum_op1 = CreateOpAndInfer( - rewriter, op->getLoc(), rsum_type, op1_exp_in.getResult(), + // Step 1. get x - max(x) + auto max_logits = CreateOpAndInfer( + rewriter, op->getLoc(), rsum_type, logits_value, rewriter.getI64IntegerAttr(input_rank - 1)); - auto op3_reciprocal_op2 = CreateOpAndInfer( - rewriter, op->getLoc(), op2_reducesum_op1.getType(), - op2_reducesum_op1.getResult()); + auto normalized_logits = + CreateOpAndInfer(rewriter, op->getLoc(), logits_type, + logits_value, max_logits.getResult()); + + // Step 2. get exp(x - max(x)) + auto exp_norm_logits = CreateOpAndInfer( + rewriter, op->getLoc(), output_type, normalized_logits); + + // Step 3. reuse softmax numerator to obtain denominator + // Keep dims so we don't need to reshape later + auto reducesum = CreateOpAndInfer( + rewriter, op->getLoc(), rsum_type, exp_norm_logits.getResult(), + rewriter.getI64IntegerAttr(input_rank - 1)); + auto denominator = CreateOpAndInfer( + rewriter, op->getLoc(), reducesum.getType(), reducesum.getResult()); return CreateOpAndInfer(rewriter, op->getLoc(), output_type, - op1_exp_in.getResult(), - op3_reciprocal_op2.getResult(), 0) + exp_norm_logits.getResult(), + denominator.getResult(), 0) .getResult(); } } @@ -2222,7 +2239,7 @@ std::optional> convertSplitVOp( // the only legal negative stride. static Value reverseNegativeStride(PatternRewriter& rewriter, Operation* op, Value input, ArrayRef strides) { - for (auto it : llvm::enumerate(strides)) { + for (const auto& it : llvm::enumerate(strides)) { auto axis = it.index(); auto stride = it.value(); if (stride != -1) continue; @@ -2321,7 +2338,7 @@ std::optional convertStridedSliceOp( } // Set begin mask values if possible. - for (auto& val : llvm::enumerate(begin)) + for (const auto& val : llvm::enumerate(begin)) begin_mask |= (val.value() == 0) << val.index(); // If all begin/end masks are set and striding is one we can just return @@ -3096,7 +3113,7 @@ std::optional convertResizeOp(PatternRewriter& rewriter, Operation* op, int& border) { // Dimension is length 1, we are just sampling from one value. if (input == 1) { - n = 1; + n = output; d = 1; offset = 0; border = output - 1; @@ -4463,5 +4480,45 @@ std::optional convertSinOp(PatternRewriter& rewriter, Operation* op, .getResult(); } +// Lowers Sign operator to a sequence of TOSA ops. 
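+// Conceptually: sign(x) = select(x > 0, 1, select(0 > x, -1, 0)). The 1, -1
+// and 0 are emitted as broadcastable single-element constants of the output
+// element type (float or integer); quantized element types are rejected for now.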
+std::optional convertSignOp(PatternRewriter& rewriter, Operation* op, + Value input, RankedTensorType output_type) { + auto output_elem_type = output_type.getElementType(); + if (output_elem_type.isa()) { + (void)rewriter.notifyMatchFailure(op, "tfl quantization not yet supported"); + return std::nullopt; + } + + // TOSA greater and select can both broadcast, so simply create a tensor with + // one element. + Value pos_one, neg_one, zero; + ImplicitLocOpBuilder builder(op->getLoc(), rewriter); + if (output_elem_type.isa()) { + pos_one = getTosaConstTensorSingleF32(rewriter, op, 1.0f); + neg_one = getTosaConstTensorSingleF32(rewriter, op, -1.0f); + zero = getTosaConstTensorSingleF32(rewriter, op, 0.0f); + } else { + pos_one = getTosaConstTensorScalarInt(builder, output_elem_type, 1); + neg_one = getTosaConstTensorScalarInt(builder, output_elem_type, -1); + zero = getTosaConstTensorScalarInt(builder, output_elem_type, 0); + } + + ShapedType const_type = output_type.clone(rewriter.getIntegerType(1)); + + auto gt_zero_op = + CreateOpAndInfer(builder, const_type, input, zero); + + auto lt_zero_op = + CreateOpAndInfer(builder, const_type, zero, input); + + auto select_neg_op = CreateOpAndInfer( + builder, output_type, lt_zero_op, neg_one, zero); + + // Select positive one based on the condition tensor. + return CreateOpAndInfer(builder, output_type, gt_zero_op, + pos_one, select_neg_op) + .getResult(); +} + }; // namespace tosa }; // namespace mlir diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h index 33eaaab4202..3dc87952753 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h @@ -302,6 +302,10 @@ std::optional convertOneHotOp(PatternRewriter& rewriter, Operation* op, std::optional convertSinOp(PatternRewriter& rewriter, Operation* op, Value input, ShapedType output_type); +// Lowers Sign operator to a sequence of TOSA ops. 
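The convertSignOp lowering added above boils down to two comparisons feeding two selects. A scalar reading of the same logic, as an illustrative C++ sketch rather than the tensor-level TOSA ops:

    // Illustrative only: scalar equivalent of the Sign decomposition above.
    //   gt_zero  = greater(input, 0)
    //   lt_zero  = greater(0, input)
    //   neg_path = select(lt_zero, -1, 0)
    //   result   = select(gt_zero, +1, neg_path)
    template <typename T>
    T SignOf(T x) {
      bool gt_zero = x > T(0);
      bool lt_zero = T(0) > x;
      T neg_or_zero = lt_zero ? T(-1) : T(0);
      return gt_zero ? T(1) : neg_or_zero;
    }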
+std::optional convertSignOp(PatternRewriter& rewriter, Operation* op, + Value input, RankedTensorType output_type); + }; // namespace tosa }; // namespace mlir diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc index d38958f7fff..5418eab622c 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc @@ -79,6 +79,7 @@ DECL_CONVERT_OP(Sub); DECL_CONVERT_OP(Mul); DECL_CONVERT_OP(Square); DECL_CONVERT_OP(SquaredDifference); +DECL_CONVERT_OP(Sign); DECL_CONVERT_OP(Round); DECL_CONVERT_OP(FloorDiv); DECL_CONVERT_OP(FloorMod); @@ -250,6 +251,21 @@ LogicalResult ConvertTFGreaterEqualOp::matchAndRewrite( return success(); } +LogicalResult ConvertTFSignOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto tf_sign_op = cast(op); + + RankedTensorType output_type = + tf_sign_op.getResult().getType().cast(); + + std::optional result = + convertSignOp(rewriter, op, tf_sign_op.getX(), output_type); + if (!result) return failure(); + + rewriter.replaceOp(op, {result.value()}); + return success(); +} + LogicalResult ConvertTFSinOp::matchAndRewrite(Operation* op, PatternRewriter& rewriter) const { auto tf_sin_op = cast(op); @@ -748,8 +764,8 @@ LogicalResult ConvertTFRankOp::matchAndRewrite( RankedTensorType rank_type = tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getIntegerType(32)); auto rank_attr = DenseI32ArrayAttr::get(rewriter.getContext(), {rank}); - auto rank_const = CreateOpAndInfer(rewriter, op->getLoc(), - rank_type, rank_attr); + auto rank_const = CreateOpAndInfer( + rewriter, op->getLoc(), rank_type, cast(rank_attr)); rewriter.replaceOp(op, {rank_const.getResult()}); @@ -780,8 +796,8 @@ LogicalResult ConvertTFShapeOp::matchAndRewrite( {static_cast(shape_arr.size())}, rewriter.getIntegerType(32)); auto shape_attr = DenseI32ArrayAttr::get(rewriter.getContext(), llvm::ArrayRef(shape_arr)); - auto shape_const = CreateOpAndInfer(rewriter, op->getLoc(), - shape_type, shape_attr); + auto shape_const = CreateOpAndInfer( + rewriter, op->getLoc(), shape_type, cast(shape_attr)); rewriter.replaceOp(op, {shape_const.getResult()}); @@ -849,11 +865,12 @@ LogicalResult ConvertTFFillOp::matchAndRewrite( return failure(); RankedTensorType fill_type = tensorflow::GetTypeFromTFTensorShape( - ArrayRef(dims_vals), value_elem.getType().getElementType()); + ArrayRef(dims_vals), + value_elem.getShapedType().getElementType()); DenseArrayAttr fill_attr; // Convert to a compatible zero type - if (value_elem.getType().getElementType().isa()) { + if (value_elem.getShapedType().getElementType().isa()) { SmallVector fill_arr( total_size, value_elem.getValues()[0].getValue().convertToFloat()); @@ -866,8 +883,8 @@ LogicalResult ConvertTFFillOp::matchAndRewrite( fill_attr = DenseI32ArrayAttr::get(rewriter.getContext(), llvm::ArrayRef(fill_arr)); } - auto fill_const_op = CreateOpAndInfer(rewriter, op->getLoc(), - fill_type, fill_attr); + auto fill_const_op = CreateOpAndInfer( + rewriter, op->getLoc(), fill_type, fill_attr.cast()); rewriter.replaceOp(op, {fill_const_op.getResult()}); return success(); @@ -2428,6 +2445,7 @@ void populateLegalizeTFPatterns(MLIRContext* ctx, RewritePatternSet& patterns) { patterns.add(ctx); patterns.add(ctx); patterns.add(ctx); + patterns.add(ctx); patterns.add(ctx); patterns.add(ctx); patterns.add(ctx); diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc 
b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc index b0d249495f9..0162ddd4a8a 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc @@ -15,6 +15,7 @@ limitations under the License. // Legalize TensorFlow Lite to TOSA +#include #include #include #include @@ -104,6 +105,7 @@ DECL_CONVERT_OP(Sub); DECL_CONVERT_OP(Mul); DECL_CONVERT_OP(Square); DECL_CONVERT_OP(SquaredDifference); +DECL_CONVERT_OP(Sign); DECL_CONVERT_OP(Round); DECL_CONVERT_OP(Div); DECL_CONVERT_OP(Maximum); @@ -123,6 +125,7 @@ DECL_CONVERT_OP(Fill); DECL_CONVERT_OP(Elu); DECL_CONVERT_OP(Softmax); DECL_CONVERT_OP(LogSoftmax); +DECL_CONVERT_OP(Rsqrt); DECL_CONVERT_OP(Sqrt); DECL_CONVERT_OP(L2Normalization); DECL_CONVERT_OP(ReduceAll); @@ -187,6 +190,7 @@ DECL_CONVERT_OP(FakeQuant); DECL_CONVERT_OP(While); DECL_CONVERT_OP(Real); DECL_CONVERT_OP(Imag); +DECL_CONVERT_OP(RFFT2d); #undef DECL_CONVERT_OP @@ -816,6 +820,21 @@ static LogicalResult matchAndRewriteAddSub(Operation* op, return success(); } +LogicalResult ConvertTFLSignOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto tfl_sign_op = cast(op); + + RankedTensorType output_type = + tfl_sign_op.getResult().getType().cast(); + + std::optional result = + convertSignOp(rewriter, op, tfl_sign_op.getX(), output_type); + if (!result) return failure(); + + rewriter.replaceOp(op, {result.value()}); + return success(); +} + LogicalResult ConvertTFLAddOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { return matchAndRewriteAddSub(op, op->getOperands(), @@ -1251,6 +1270,126 @@ LogicalResult ConvertTFLMaxPool2DOp::matchAndRewrite( return success(); } +// Returns a new type based on an input type and slicing information. +// If the input is quantized per axis, slices the scale and zp arrays. +// In any other case, returns the original type. +RankedTensorType getTypeForSlice(RankedTensorType type, int64_t slice_size, + int64_t offset) { + if (auto per_channel_qtype = + dyn_cast(type.getElementType())) { + SmallVector output_scale_arr( + per_channel_qtype.getScales().begin() + offset, + per_channel_qtype.getScales().begin() + offset + slice_size); + SmallVector output_zp_arr( + per_channel_qtype.getZeroPoints().begin() + offset, + per_channel_qtype.getZeroPoints().begin() + offset + slice_size); + auto output_per_channel_qtype = quant::UniformQuantizedPerAxisType::get( + per_channel_qtype.getFlags(), per_channel_qtype.getStorageType(), + per_channel_qtype.getExpressedType(), output_scale_arr, output_zp_arr, + per_channel_qtype.getQuantizedDimension(), + per_channel_qtype.getStorageTypeMin(), + per_channel_qtype.getStorageTypeMax()); + return RankedTensorType::get(type.getShape(), output_per_channel_qtype); + } + return type; +} + +Value lowerGroupedConvolution(TFL::Conv2DOp op, PatternRewriter& rewriter) { + auto input_type = dyn_cast(op.getInput().getType()); + auto filter_type = dyn_cast(op.getFilter().getType()); + auto bias_type = dyn_cast(op.getBias().getType()); + auto output_type = dyn_cast(op.getResult().getType()); + + // The inputs are NHWC, so the slicing/concatenation is done over dim 3. 
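The bookkeeping used in the rest of lowerGroupedConvolution below can be summarized as follows; this is a standalone sketch with hypothetical helper names, while the actual lowering materializes tosa.slice, tosa.conv2d, and tosa.concat ops.

    // Hypothetical helper (not patch code): when the filter's channel dim
    // divides the input's channel dim, the input is cut into num_groups
    // channel slices, each convolved with its own slice of the filter and
    // bias, and the per-group results are concatenated back on dim 3.
    #include <cassert>
    #include <cstdint>

    struct GroupPlan {
      int64_t num_groups;
      int64_t input_channels_per_group;
      int64_t output_channels_per_group;
    };

    GroupPlan PlanGroupedConv(int64_t input_channels, int64_t filter_channels,
                              int64_t output_channels) {
      assert(filter_channels > 0 && input_channels % filter_channels == 0);
      int64_t num_groups = input_channels / filter_channels;
      return {num_groups, filter_channels, output_channels / num_groups};
    }

    // Example: input_channels = 8, filter_channels = 2, output_channels = 16
    // gives 4 groups, each convolving 2 input channels into 4 output channels.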
+ int64_t in_channels_dim = 3; + int64_t input_channels = input_type.getDimSize(in_channels_dim); + int64_t filter_channels = filter_type.getDimSize(in_channels_dim); + int64_t num_groups = input_channels / filter_channels; + + SmallVector convolutions; + convolutions.reserve(num_groups); + auto rank = input_type.getRank(); + + // Input size vector + SmallVector input_size_vals(input_type.getShape().begin(), + input_type.getShape().end()); + input_size_vals.back() = filter_channels; + DenseI64ArrayAttr input_size = rewriter.getDenseI64ArrayAttr(input_size_vals); + auto input_slice_ty = + RankedTensorType::get(input_size_vals, input_type.getElementType()); + + // Filter size vector + SmallVector filter_size_vals(filter_type.getShape().begin(), + filter_type.getShape().end()); + filter_size_vals.front() = filter_type.getDimSize(0) / num_groups; + DenseI64ArrayAttr filter_size = + rewriter.getDenseI64ArrayAttr(filter_size_vals); + auto filter_slice_ty = + RankedTensorType::get(filter_size_vals, filter_type.getElementType()); + + // Bias size vector + int64_t bias_size_val = bias_type.getDimSize(0) / num_groups; + DenseI64ArrayAttr bias_size = rewriter.getDenseI64ArrayAttr(bias_size_val); + auto bias_slice_ty = + RankedTensorType::get(bias_size_val, bias_type.getElementType()); + + auto per_conv_out_ty = RankedTensorType::get( + {output_type.getDimSize(0), output_type.getDimSize(1), + output_type.getDimSize(2), output_type.getDimSize(3) / num_groups}, + output_type.getElementType()); + + // Create a separate convolution for each group + for (int i = 0; i < num_groups; ++i) { + auto verified_input_slice_ty = + getTypeForSlice(input_slice_ty, filter_channels, i * filter_channels); + auto verified_filter_slice_ty = + getTypeForSlice(filter_slice_ty, filter_channels, i * filter_channels); + auto verified_bias_slice_ty = + getTypeForSlice(bias_slice_ty, filter_channels, i * filter_channels); + auto verified_per_conv_out_ty = + getTypeForSlice(per_conv_out_ty, filter_channels, i * filter_channels); + + // Slice the input + SmallVector input_start_vals(rank, 0); + input_start_vals.back() = i * filter_channels; + DenseI64ArrayAttr input_start = + rewriter.getDenseI64ArrayAttr(input_start_vals); + + auto slice_input = rewriter.createOrFold( + op->getLoc(), verified_input_slice_ty, op.getInput(), input_start, + input_size); + + // Slice the filter + SmallVector filter_start_vals(rank, 0); + filter_start_vals.front() = i * filter_channels; + DenseI64ArrayAttr filter_start = + rewriter.getDenseI64ArrayAttr(filter_start_vals); + + auto slice_filter = rewriter.createOrFold( + op->getLoc(), verified_filter_slice_ty, op.getFilter(), filter_start, + filter_size); + + // Slice the bias + DenseI64ArrayAttr bias_start = + rewriter.getDenseI64ArrayAttr(i * filter_channels); + auto slice_bias = rewriter.createOrFold( + op->getLoc(), verified_bias_slice_ty, op.getBias(), bias_start, + bias_size); + + // Create a convolution for each set of slices + auto conv = rewriter.create( + op->getLoc(), verified_per_conv_out_ty, slice_input, slice_filter, + slice_bias, op.getDilationHFactor(), op.getDilationWFactor(), + op.getFusedActivationFunction(), op.getPadding(), op.getStrideH(), + op.getStrideW()); + + convolutions.push_back(conv.getResult()); + } + + return rewriter.createOrFold(op->getLoc(), output_type, + convolutions, in_channels_dim); +} + LogicalResult ConvertTFLConv2DOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tfl_conv2d_op = cast(op); @@ -1281,6 +1420,14 @@ LogicalResult 
ConvertTFLConv2DOp::matchAndRewrite( "be all quantized or all floating-point"); } + int64_t input_channels = input_type.getDimSize(3); + int64_t filter_channels = filter_type.getDimSize(3); + if (input_channels != filter_channels && + input_channels % filter_channels == 0) { + rewriter.replaceOp(op, lowerGroupedConvolution(tfl_conv2d_op, rewriter)); + return success(); + } + DenseI64ArrayAttr pad; DenseI64ArrayAttr stride; DenseI64ArrayAttr dilation; @@ -2041,8 +2188,8 @@ LogicalResult ConvertTFLRankOp::matchAndRewrite( RankedTensorType rank_type = RankedTensorType::get({1}, rewriter.getIntegerType(32)); auto rank_attr = DenseI32ArrayAttr::get(rewriter.getContext(), {rank}); - auto rank_const = CreateOpAndInfer(rewriter, op->getLoc(), - rank_type, rank_attr); + auto rank_const = CreateOpAndInfer( + rewriter, op->getLoc(), rank_type, rank_attr.cast()); rewriter.replaceOp(op, {rank_const.getResult()}); @@ -2074,8 +2221,8 @@ LogicalResult ConvertTFLShapeOp::matchAndRewrite( {static_cast(shape_arr.size())}, rewriter.getIntegerType(32)); auto shape_attr = DenseI32ArrayAttr::get(rewriter.getContext(), llvm::ArrayRef(shape_arr)); - auto shape_const = CreateOpAndInfer(rewriter, op->getLoc(), - shape_type, shape_attr); + auto shape_const = CreateOpAndInfer( + rewriter, op->getLoc(), shape_type, shape_attr.cast()); rewriter.replaceOp(op, {shape_const.getResult()}); @@ -2142,12 +2289,13 @@ LogicalResult ConvertTFLFillOp::matchAndRewrite( if (!matchPattern(tfl_fill_op.getInput(), m_Constant(&value_elem))) return failure(); - RankedTensorType fill_type = RankedTensorType::get( - ArrayRef(dims_vals), value_elem.getType().getElementType()); + RankedTensorType fill_type = + RankedTensorType::get(ArrayRef(dims_vals), + value_elem.getShapedType().getElementType()); DenseArrayAttr fill_attr; // Convert to a compatible zero type. 
- if (value_elem.getType().getElementType().isa()) { + if (value_elem.getShapedType().getElementType().isa()) { SmallVector fill_arr( total_size, value_elem.getValues()[0].convertToFloat()); fill_attr = @@ -2158,8 +2306,8 @@ LogicalResult ConvertTFLFillOp::matchAndRewrite( fill_attr = DenseI32ArrayAttr::get(rewriter.getContext(), llvm::ArrayRef(fill_arr)); } - auto fill_const_op = CreateOpAndInfer(rewriter, op->getLoc(), - fill_type, fill_attr); + auto fill_const_op = CreateOpAndInfer( + rewriter, op->getLoc(), fill_type, fill_attr.cast()); rewriter.replaceOp(op, {fill_const_op.getResult()}); return success(); @@ -2350,6 +2498,54 @@ LogicalResult ConvertTFLSoftmaxOp::matchAndRewrite( return success(); } +LogicalResult ConvertTFLRsqrtOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto tfl_rsqrt_op = cast(op); + + RankedTensorType output_type = + tfl_rsqrt_op.getResult().getType().dyn_cast(); + RankedTensorType input_type = + tfl_rsqrt_op.getX().getType().dyn_cast(); + + mlir::quant::UniformQuantizedType input_qtype = + input_type.getElementType() + .dyn_cast_or_null(); + mlir::quant::UniformQuantizedType output_qtype = + output_type.getElementType() + .dyn_cast_or_null(); + + // Quantization case + if (input_qtype && output_qtype) { + auto rsqrt_func = [](double x) -> double { + // Negative numbers are undefined for rsqrt + // 0 should return the max value of the storage data type for rsqrt + if (x <= 0.0) return DBL_MAX; + return 1.0 / std::sqrt(x); + }; + + // 16-bit is pending review for TFL + // https://github.com/tensorflow/tensorflow/pull/58406 + if (input_qtype.getStorageTypeIntegralWidth() != 8) { + return rewriter.notifyMatchFailure(op, + "input qtype storage width is not 8"); + } + + // Implement with 8-bit table lookup. + Value table_const = getTosaConst8bitTable( + rewriter, op, input_qtype.getScale(), input_qtype.getZeroPoint(), + output_qtype.getScale(), output_qtype.getZeroPoint(), rsqrt_func); + + CreateReplaceOpAndInfer(rewriter, op, output_type, + tfl_rsqrt_op.getX(), table_const); + return success(); + } + + CreateReplaceOpAndInfer(rewriter, op, tfl_rsqrt_op.getType(), + tfl_rsqrt_op.getX()); + + return success(); +} + LogicalResult ConvertTFLSqrtOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tfl_rsqrt_op = cast(op); @@ -3365,21 +3561,20 @@ static LogicalResult LegalizeQuantizedPrelu(Operation* op, // Perform an element-wise multiplication on rescaled alpha and input for // PReLU. 
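The quantized Rsqrt path above builds a 256-entry int8 lookup table: each int8 code is dequantized, passed through 1/sqrt(x), and requantized with saturation. A standalone sketch of that construction (hypothetical helper; the saturation guard is a simplified variant of the DBL_MAX check this patch adds to getTosaConst8bitTable):

    // Standalone sketch (not patch code): building an int8 rsqrt table.
    #include <algorithm>
    #include <array>
    #include <cfloat>
    #include <cmath>
    #include <cstdint>

    std::array<int8_t, 256> BuildRsqrtTable(double in_scale, int32_t in_zp,
                                            double out_scale, int32_t out_zp) {
      std::array<int8_t, 256> table;
      for (int32_t i = -128; i < 128; ++i) {
        double x = in_scale * (i - in_zp);                 // dequantize
        double y = (x <= 0.0) ? DBL_MAX : 1.0 / std::sqrt(x);
        if (y / out_scale + out_zp >= 127.0) {             // saturate high
          table[i + 128] = 127;
          continue;
        }
        int64_t q = std::llround(y / out_scale) + out_zp;  // requantize
        table[i + 128] = static_cast<int8_t>(
            std::min<int64_t>(127, std::max<int64_t>(-128, q)));
      }
      return table;
    }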
- Value alpha = tfl_prelu_op.getAlpha(); - ShapedType alpha_type = alpha.getType().cast(); - UniformQuantizedType alpha_qtype = - alpha_type.getElementType().cast(); + Value alpha = tfl_prelu_op.getAlpha(); + ShapedType alpha_type = alpha.getType().cast(); + UniformQuantizedType alpha_qtype = + alpha_type.getElementType().cast(); - Value op_rescale_alpha = removeZeroPointAndCastToInt32( - rewriter, op, alpha, alpha_qtype.getZeroPoint()); + Value op_rescale_alpha = removeZeroPointAndCastToInt32( + rewriter, op, alpha, alpha_qtype.getZeroPoint()); - Value op_mul = - CreateOpAndInfer(rewriter, op->getLoc(), rescale_type, - op_rescale_in, op_rescale_alpha, 0); + Value op_mul = CreateOpAndInfer( + rewriter, op->getLoc(), rescale_type, op_rescale_in, op_rescale_alpha, 0); - op_rescale_slope_in = buildRescale( - rewriter, op, output_type, op_mul, scale_alpha, - /* input_zp = */ 0, output_qtype.getZeroPoint(), true, true); + op_rescale_slope_in = + buildRescale(rewriter, op, output_type, op_mul, scale_alpha, + /* input_zp = */ 0, output_qtype.getZeroPoint(), true, true); Value op_rescale_identity_in = buildRescale( rewriter, op, output_type, input, scale_identity, @@ -3745,7 +3940,7 @@ LogicalResult ConvertTFLConstOp::matchAndRewrite( if (!output_type) return failure(); ElementsAttr elements = tfl_const_op.getValue(); - Type element_type = elements.getType().getElementType(); + Type element_type = elements.getShapedType().getElementType(); if (output_type.getElementType().isa()) { output_type = RankedTensorType::get(output_type.getShape(), element_type); } @@ -4190,6 +4385,74 @@ LogicalResult ConvertTFLImagOp::matchAndRewrite( return success(); } +LogicalResult ConvertTFLRFFT2dOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto rfft2d_op = cast(op); + auto loc = op->getLoc(); + Value input = rfft2d_op.getInput(); + + auto input_type = dyn_cast(input.getType()); + auto output_type = dyn_cast(rfft2d_op.getResult().getType()); + + if (!input_type || !output_type) { + return rewriter.notifyMatchFailure(op, "ranked input/output required"); + } + + if (!input_type.getElementType().isF32()) { + return rewriter.notifyMatchFailure(op, "input type must be fp32"); + } + + Value fft_length_value = rfft2d_op.getFftLength(); + llvm::SmallVector fft_length; + if (failed(getVectorFromValue32(fft_length_value, fft_length))) { + return rewriter.notifyMatchFailure(op, "fft_length is not a constant"); + } + + auto fp32_ty = UnrankedTensorType::get(rewriter.getF32Type()); + + // Padding is automatically inserted during the lowering when + // fft_length > input shape. However, to take care of the + // case fft_length < input shape we need to crop the input. 
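A compact restatement of the crop-size computation that follows: all leading dims are kept and the trailing two dims become fft_length, while the opposite case (fft_length larger than the input dims) is left to the padding performed inside the lowering. Standalone sketch with a hypothetical helper name:

    // Illustrative only: slice size used to crop the RFFT2d input when
    // fft_length is smaller than the trailing input dims. Assumes rank >= 2.
    #include <cstdint>
    #include <vector>

    std::vector<int64_t> RfftCropSize(const std::vector<int64_t>& input_shape,
                                      int64_t fft_h, int64_t fft_w) {
      std::vector<int64_t> slice_size(input_shape.begin(),
                                      input_shape.end() - 2);
      slice_size.push_back(fft_h);
      slice_size.push_back(fft_w);
      return slice_size;
    }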
+ const int64_t rank = input_type.getRank(); + auto input_shape = input_type.getShape(); + if (fft_length[0] < input_shape[rank - 2] || + fft_length[1] < input_shape[rank - 1]) { + llvm::SmallVector slice_begin(rank, 0); + llvm::SmallVector slice_size; + for (auto dim : input_type.getShape().drop_back(2)) { + slice_size.push_back(dim); + } + slice_size.push_back(fft_length[0]); + slice_size.push_back(fft_length[1]); + input = CreateOpAndInfer( + rewriter, loc, fp32_ty, input, + rewriter.getDenseI64ArrayAttr(slice_begin), + rewriter.getDenseI64ArrayAttr(slice_size)); + } + + auto rfft2d = + CreateOpAndInfer(rewriter, loc, fp32_ty, fp32_ty, input); + + auto output_shape = output_type.getShape(); + llvm::SmallVector new_shape{output_shape}; + new_shape.push_back(1); + auto reshape_1 = CreateOpAndInfer( + rewriter, loc, fp32_ty, rfft2d.getResult(0), + rewriter.getDenseI64ArrayAttr(new_shape)); + auto reshape_2 = CreateOpAndInfer( + rewriter, loc, fp32_ty, rfft2d.getResult(1), + rewriter.getDenseI64ArrayAttr(new_shape)); + + llvm::SmallVector values = {reshape_1, reshape_2}; + auto concat = CreateOpAndInfer(rewriter, loc, fp32_ty, values, + rewriter.getI64IntegerAttr(3)); + + CreateReplaceOpAndInfer( + rewriter, op, output_type, concat.getResult()); + + return success(); +} + LogicalResult LegalizeTFL::initialize(MLIRContext* context) { RewritePatternSet patterns(context); mlir::tosa::populateLegalizeTFLPatterns(context, patterns); @@ -4241,6 +4504,7 @@ void populateLegalizeTFLPatterns(MLIRContext* ctx, DEF_PATTERN_INSERT(TFLMul); DEF_PATTERN_INSERT(TFLSquare); DEF_PATTERN_INSERT(TFLSquaredDifference); + DEF_PATTERN_INSERT(TFLSign); DEF_PATTERN_INSERT(TFLRound); DEF_PATTERN_INSERT(TFLDiv); DEF_PATTERN_INSERT(TFLMaximum); @@ -4325,6 +4589,7 @@ void populateLegalizeTFLPatterns(MLIRContext* ctx, DEF_PATTERN_INSERT(TFLWhile); DEF_PATTERN_INSERT(TFLReal); DEF_PATTERN_INSERT(TFLImag); + DEF_PATTERN_INSERT(TFLRFFT2d); } // Creates an instance of the TensorFlow Lite dialect LegalizeTFL pass. diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc index 762ba97ed62..ff8616687a2 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc @@ -285,6 +285,13 @@ Value getTosaConst8bitTable(PatternRewriter& rewriter, Operation* op, for (int32_t i = -128; i < 128; i++) { double dequantized = input_scale * (i - input_zp); double transformed = func(dequantized); + + double max = (output_scale > 1.0) ? DBL_MAX : (DBL_MAX * output_scale); + if (transformed >= max) { + table.push_back(INT8_MAX); + continue; + } + int32_t rescaled = std::llround(transformed / output_scale); int32_t quantized = static_cast(rescaled + output_zp); table.push_back( @@ -434,6 +441,21 @@ Value getTosaConstTensorSingleI32(PatternRewriter& rewriter, Operation* op, return const_op.getResult(); } +// Create an expected bitwidth integer constant operator based on the type +// parameter. +Value getTosaConstTensorScalarInt(ImplicitLocOpBuilder& builder, Type type, + int64_t val) { + auto bit_width = type.getIntOrFloatBitWidth(); + auto const_type = tensorflow::GetTypeFromTFTensorShape( + {}, builder.getIntegerType(bit_width)); + auto const_attr = + SplatElementsAttr::get(const_type, builder.getIntegerAttr(type, val)); + + auto const_op = + builder.create(builder.getLoc(), const_type, const_attr); + return const_op.getResult(); +} + // Create a vector from a 32-bit value tensor. 
Returns the size of // the new vector or -1 on error. LogicalResult getVectorFromValue32(Value val, SmallVectorImpl& vec) { @@ -695,7 +717,10 @@ LogicalResult ApplyPatternsWithShapeResolution( // This should be investigate for whether it is still necessary due to quant // type stripping changing. func.walk([&](tosa::ConstOp op) { - auto ety = op.getValue().getType().getElementType(); + if (op.getType().getElementType().isa()) { + return; + } + auto ety = op.getValue().getShapedType().getElementType(); auto new_ty = op.getType().cast().clone(ety); op.getResult().setType(new_ty); }); diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h index c67791200f5..b2e76197fb5 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_LEGALIZE_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_TOSA_TRANSFORMS_LEGALIZE_UTILS_H_ +#include #include #include #include @@ -28,6 +29,7 @@ limitations under the License. #include "mlir/Dialect/Tosa/Utils/ShapeUtils.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/ImplicitLocOpBuilder.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project #include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project @@ -103,6 +105,11 @@ Value getTosaConstTensorSingleF32(PatternRewriter& rewriter, Operation* op, Value getTosaConstTensorSingleI32(PatternRewriter& rewriter, Operation* op, int32_t val); +// Create an expected bitwidth integer constant operator based on the type +// parameter. +Value getTosaConstTensorScalarInt(ImplicitLocOpBuilder& builder, Type type, + int64_t val); + // Create a vector from a 32-bit value tensor. Returns vector size on success // or -1 on error. LogicalResult getVectorFromValue32(Value val, SmallVectorImpl& vec); @@ -151,9 +158,9 @@ LogicalResult ApplyPatternsWithShapeResolution( // Creates a TOSA operation and performs shape inference on the individual // op. This allows shape inference during the TFLite to TOSA lowering. template -TosaOp CreateOpAndInfer(PatternRewriter& rewriter, Location loc, Type result_ty, +TosaOp CreateOpAndInfer(ImplicitLocOpBuilder& builder, Type result_ty, Args&&... args) { - auto op = rewriter.create(loc, result_ty, args...); + auto op = builder.create(result_ty, args...); InferShapedTypeOpInterface shapeInterface = dyn_cast(op.getOperation()); @@ -161,8 +168,9 @@ TosaOp CreateOpAndInfer(PatternRewriter& rewriter, Location loc, Type result_ty, SmallVector returnedShapes; if (shapeInterface - .inferReturnTypeComponents(op.getContext(), op.getLoc(), + .inferReturnTypeComponents(op.getContext(), builder.getLoc(), op->getOperands(), op->getAttrDictionary(), + op->getPropertiesStorage(), op->getRegions(), returnedShapes) .failed()) return op; @@ -196,6 +204,13 @@ TosaOp CreateOpAndInfer(PatternRewriter& rewriter, Location loc, Type result_ty, return op; } +template +TosaOp CreateOpAndInfer(PatternRewriter& rewriter, Location loc, Type result_ty, + Args&&... 
args) { + ImplicitLocOpBuilder builder(loc, rewriter); + return CreateOpAndInfer(builder, result_ty, args...); +} + template void CreateReplaceOpAndInfer(PatternRewriter& rewriter, Operation* op, Type result_ty, Args&&... args) { diff --git a/tensorflow/compiler/mlir/tosa/transforms/lower_global_tensors.cc b/tensorflow/compiler/mlir/tosa/transforms/lower_global_tensors.cc new file mode 100644 index 00000000000..de30f7c2fb0 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/transforms/lower_global_tensors.cc @@ -0,0 +1,206 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "mlir/Dialect/MLProgram/IR/MLProgram.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h" + +#define PASS_NAME "tosa-lower-global-tensors" +#define DEBUG_TYPE PASS_NAME + +namespace mlir::tosa { + +#define GEN_PASS_DEF_LOWERGLOBALTENSORS +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" + +namespace { + +class LowerGlobalTensorsPass + : public impl::LowerGlobalTensorsBase { + public: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + // Converts TFLite state operations to the MLProgram equivalent. + void runOnOperation() override { + auto* context = &getContext(); + auto moduleOp = getOperation(); + mlir::OpBuilder builder(moduleOp.getBodyRegion()); + + DenseMap symNameToFunction; + for (auto func : moduleOp.getOps()) { + symNameToFunction[func.getSymName()] = func; + } + + DenseMap sharedNameToConstant; + DenseMap sharedNameToLoc; + + SmallVector handleOps; + SmallVector assignOps; + SmallVector readOps; + for (auto it : symNameToFunction) { + auto func = std::get<1>(it); + // Look through the initialization functions and find the assigned values + // for each handle, save out the constant value. + for (auto init : func.getOps()) { + auto findInitFunc = + symNameToFunction.find(init.getSessionInitFunction()); + if (findInitFunc == symNameToFunction.end()) { + init.emitError("unable to find initialization function: " + + init.getSessionInitFunction()); + continue; + } + func::FuncOp initFunc = std::get<1>(*findInitFunc); + for (auto assign : initFunc.getOps()) { + auto handle = dyn_cast( + assign.getResourceId().getDefiningOp()); + if (!handle) continue; + + DenseElementsAttr constant; + if (!matchPattern(assign.getValue(), m_Constant(&constant))) { + // Quantized types we can not use the m_Constant matcher. 
+ if (auto constOp = dyn_cast( + assign.getValue().getDefiningOp())) { + constant = constOp.getValue().cast(); + } + } + if (!constant) continue; + + auto name = handle.getSharedName(); + sharedNameToConstant[name] = constant; + sharedNameToLoc[name] = handle.getLoc(); + } + } + + // We also want to grab the list of operations to replace. + for (auto& op : func.getOps()) { + if (auto handle = dyn_cast(op)) + handleOps.push_back(handle); + if (auto assign = dyn_cast(op)) + assignOps.push_back(assign); + if (auto read = dyn_cast(op)) + readOps.push_back(read); + } + } + + // TF::CallOnceOps are no longer needed as we have already extracted their + // state. + SmallVector callOnceOps; + for (auto func : moduleOp.getOps()) { + for (auto init : func.getOps()) { + callOnceOps.push_back(init); + } + } + for (auto op : callOnceOps) op.erase(); + + // Create the ml_program::GlobalOps to store our new global variables. + DenseMap symbolRefMap; + for (auto it : sharedNameToConstant) { + auto name = std::get<0>(it); + auto attribute = std::get<1>(it); + auto locIt = sharedNameToLoc.find(name); + LocationAttr loc = mlir::UnknownLoc(); + if (locIt != sharedNameToLoc.end()) { + loc = std::get<1>(*locIt); + } + + // TODO(suderman): Determine the global type based on all store + // operations. + auto global = builder.create( + loc, name, attribute.getType(), /*is_mutable=*/true, attribute, + nullptr); + global.setPrivate(); + + symbolRefMap[name] = global; + } + + // Replace the assign ops with a global store operation. + for (auto assign : assignOps) { + auto handle = dyn_cast( + assign.getResourceId().getDefiningOp()); + if (!handle) continue; + + Value value = assign.getValue(); + auto globalOpIt = symbolRefMap.find(handle.getSharedName()); + if (globalOpIt == symbolRefMap.end()) { + assign->emitError( + "unable to find corresponding GlobalOp for op's VarHandle"); + continue; + } + auto globalOp = std::get<1>(*globalOpIt); + + builder.setInsertionPoint(assign); + if (globalOp.getType() != value.getType()) { + value = builder + .create( + assign.getLoc(), globalOp.getType(), value) + .getResult(0); + } + + auto globalSymbolRef = SymbolRefAttr::get(context, globalOp.getSymName()); + builder.create(assign.getLoc(), + globalSymbolRef, value); + assign.erase(); + } + + for (auto read : readOps) { + auto handle = dyn_cast( + read.getResourceId().getDefiningOp()); + if (!handle) continue; + + auto globalOpIt = symbolRefMap.find(handle.getSharedName()); + if (globalOpIt == symbolRefMap.end()) continue; + auto globalOp = std::get<1>(*globalOpIt); + + builder.setInsertionPoint(read); + + auto globalSymbolRef = SymbolRefAttr::get(context, globalOp.getSymName()); + Value load = builder.create( + read.getLoc(), globalOp.getType(), globalSymbolRef); + + if (read.getType() != load.getType()) { + load = builder + .create(read.getLoc(), + read.getType(), load) + .getResult(0); + } + read.getResult().replaceAllUsesWith(load); + read.erase(); + } + + for (auto handle : handleOps) { + if (handle.getResult().use_empty()) { + handle.erase(); + } + } + } +}; + +} // namespace + +std::unique_ptr> createLowerGlobalTensorsPass() { + return std::make_unique(); +} + +} // namespace mlir::tosa diff --git a/tensorflow/compiler/mlir/tosa/transforms/passes.h b/tensorflow/compiler/mlir/tosa/transforms/passes.h index 8721c83a50f..99f9465c8a6 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/passes.h +++ b/tensorflow/compiler/mlir/tosa/transforms/passes.h @@ -22,6 +22,7 @@ limitations under the License. 
#include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project namespace mlir { @@ -55,13 +56,21 @@ std::unique_ptr> createLegalizeTFLPass( ArrayRef disabled_patterns = std::nullopt, ArrayRef enabled_patterns = std::nullopt); -std::unique_ptr> createLegalizeTFTFLPass(); +std::unique_ptr> createLowerGlobalTensorsPass(); +std::unique_ptr> createRetainCallOnceFuncsPass(); +std::unique_ptr> createStripModuleMetadataPass(); std::unique_ptr> createConvertTFLUint8Pass(); -std::unique_ptr> createStripQuantTypesPass(); -std::unique_ptr> createLowerComplexTypesPass(); +std::unique_ptr> +createConvertFunctionMetadataPass(); std::unique_ptr> createDequantizeTFLSoftmaxPass(); +std::unique_ptr> createLegalizeTFTFLPass(); +std::unique_ptr> createLowerComplexTypesPass(); +std::unique_ptr> createStripFunctionMetadataPass(); +std::unique_ptr> createStripQuantTypesPass(); +std::unique_ptr> createVerifyFullyConvertedPass(); #define GEN_PASS_REGISTRATION +#define GEN_PASS_CLASSES #define GEN_PASS_DECL_TOSALEGALIZETFPASS #define GEN_PASS_DECL_TOSALEGALIZETFLPASS #define GEN_PASS_DECL_TOSALEGALIZETFTFLPASS @@ -70,6 +79,12 @@ std::unique_ptr> createDequantizeTFLSoftmaxPass(); #define GEN_PASS_DECL_TOSASTRIPQUANTTYPESPASS #define GEN_PASS_DECL_TOSALOWERCOMPLEXTYPESPASS #define GEN_PASS_DECL_TOSADEQUANTIZETFLSOFTMAXPASS +#define GEN_PASS_DECL_LOWERGLOBALTENSORS +#define GEN_PASS_DECL_RETAINCALLONCEFUNCS +#define GEN_PASS_DECL_STRIPFUNCTIONMETADATA +#define GEN_PASS_DECL_STRIPMODULEMETADATA +#define GEN_PASS_DECL_VERIFYFULLYCONVERTED +#define GEN_PASS_DECL_CONVERTFUNCTIONMETADATA #include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" diff --git a/tensorflow/compiler/mlir/tosa/transforms/passes.td b/tensorflow/compiler/mlir/tosa/transforms/passes.td index f2c3fe1d463..e623760a4e9 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/passes.td +++ b/tensorflow/compiler/mlir/tosa/transforms/passes.td @@ -88,3 +88,40 @@ def TosaDequantizeTFLSoftmaxPass : Pass<"tosa-dequantize-tfl-softmax", "mlir::fu let constructor = "createDequantizeTFLSoftmaxPass()"; let dependentDialects = ["mlir::TFL::TFLDialect", "quantfork::QuantizationForkDialect"]; } + +def LowerGlobalTensors : + Pass<"tflite-lower-global-tensors", "mlir::ModuleOp"> { + let summary = "Lowers TFLite global tensors to MLProgram dialect variables."; + let constructor = "createLowerGlobalTensorsPass()"; +} + +def RetainCallOnceFuncs : + Pass<"tflite-retain-call-once-funcs", "mlir::ModuleOp"> { + let summary = "Guarantees that functions used by tfl.call_once are retained."; + let constructor = "createRetainCallOnceFuncsPass()"; +} + +def StripFunctionMetadata : + Pass<"tosa-tflite-strip-function-metadata", "mlir::func::FuncOp"> { + let summary = "Strip all unneeded TF/TFLite specific metadata."; + let constructor = "createStripFunctionMetadataPass()"; +} + +def StripModuleMetadata : + Pass<"tosa-tflite-strip-module-metadata", "mlir::ModuleOp"> { + let summary = "Strip all unneeded TF/TFLite specific metadata."; + let constructor = "createStripModuleMetadataPass()"; +} + +def VerifyFullyConverted : + Pass<"tosa-tflite-verify-fully-converted", "mlir::func::FuncOp"> { + let summary = "Verifies that all TFLite frontend ops were converted and none remain."; + let constructor = "createVerifyFullyConvertedPass()"; +} + +def ConvertFunctionMetadata : + Pass<"tosa-tflite-convert-function-metadata", "mlir::func::FuncOp"> { + let summary = "Converts 
TFLite input attributes to MLProgram arg attributes on functions."; + let constructor = "createConvertFunctionMetadataPass()"; +} + diff --git a/tensorflow/compiler/mlir/tosa/transforms/retain_call_once_funcs.cc b/tensorflow/compiler/mlir/tosa/transforms/retain_call_once_funcs.cc new file mode 100644 index 00000000000..de76f4585ad --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/transforms/retain_call_once_funcs.cc @@ -0,0 +1,68 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h" + +#define PASS_NAME "retain-call-once-funcs" +#define DEBUG_TYPE PASS_NAME + +namespace mlir::tosa { + +#define GEN_PASS_DEF_RETAINCALLONCEFUNCS +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" + +namespace { + +class RetainCallOnceFuncsPass + : public impl::RetainCallOnceFuncsBase { + public: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + void runOnOperation() override { + auto moduleOp = getOperation(); + + llvm::DenseMap funcMap; + for (auto func : moduleOp.getOps()) { + funcMap[func.getSymName()] = func; + } + + for (auto func : moduleOp.getOps()) { + for (auto callOnce : func.getOps()) { + auto callFunc = funcMap[callOnce.getSessionInitFunction()]; + callOnce->setAttr("session_init_function_symbol", + SymbolRefAttr::get(callFunc)); + } + } + } +}; + +} // anonymous namespace + +std::unique_ptr> createRetainCallOnceFuncsPass() { + return std::make_unique(); +} + +} // namespace mlir::tosa diff --git a/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc b/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc new file mode 100644 index 00000000000..7960285fdb1 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc @@ -0,0 +1,103 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "mlir/IR/FunctionInterfaces.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h" + +#define PASS_NAME "tosa-strip-metadata" +#define DEBUG_TYPE PASS_NAME + +namespace mlir::tosa { + +#define GEN_PASS_DEF_STRIPM +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" + +namespace { + +static bool isTFLAttr(NamedAttribute &namedAttr) { + // TFLite uses both tf and tfl in attribute annotations. + auto name = namedAttr.getName().strref(); + // Don't trim attributes from tf_saved_model---they carry ABI information. + if (name.startswith("tf_saved_model.")) return false; + + if (name.startswith("tf.") || name.startswith("tf_") || + name.startswith("tfl.") || name.startswith("tfl_")) { + return true; + } + StringRef attrNamespace = namedAttr.getValue().getDialect().getNamespace(); + return attrNamespace == "tf" || attrNamespace == "tfl"; +} + +class StripModuleMetadataPass + : public StripModuleMetadataBase { + public: + void runOnOperation() override { + auto moduleOp = getOperation(); + auto stripAttrs = llvm::to_vector<4>(llvm::make_filter_range( + moduleOp->getAttrs(), + [](NamedAttribute namedAttr) { return isTFLAttr(namedAttr); })); + for (auto namedAttr : stripAttrs) { + moduleOp->removeAttr(namedAttr.getName()); + } + } +}; + +class StripFunctionMetadataPass + : public StripFunctionMetadataBase { + public: + void runOnOperation() override { + auto funcOp = getOperation(); + auto stripAttrs = llvm::to_vector<4>(llvm::make_filter_range( + funcOp->getAttrs(), + [](NamedAttribute namedAttr) { return isTFLAttr(namedAttr); })); + for (auto namedAttr : stripAttrs) { + funcOp->removeAttr(namedAttr.getName()); + } + + for (int i = 0, e = funcOp.getNumArguments(); i < e; ++i) { + auto stripAttrs = llvm::to_vector<4>(llvm::make_filter_range( + mlir::function_interface_impl::getArgAttrs(funcOp, i), + [](NamedAttribute namedAttr) { return isTFLAttr(namedAttr); })); + for (auto namedAttr : stripAttrs) { + funcOp.removeArgAttr(i, namedAttr.getName()); + } + } + + for (int i = 0, e = funcOp.getNumResults(); i < e; ++i) { + auto stripAttrs = llvm::to_vector<4>(llvm::make_filter_range( + mlir::function_interface_impl::getResultAttrs(funcOp, i), + [](NamedAttribute namedAttr) { return isTFLAttr(namedAttr); })); + for (auto namedAttr : stripAttrs) { + funcOp.removeResultAttr(i, namedAttr.getName()); + } + } + } +}; + +} // anonymous namespace + +std::unique_ptr> createStripModuleMetadataPass() { + return std::make_unique(); +} + +std::unique_ptr> createStripFunctionMetadataPass() { + return std::make_unique(); +} + +} // namespace mlir::tosa diff --git a/tensorflow/compiler/mlir/tosa/transforms/tfl_legalize_patterns.td b/tensorflow/compiler/mlir/tosa/transforms/tfl_legalize_patterns.td index cec441d25ef..1745907f8a8 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/tfl_legalize_patterns.td +++ b/tensorflow/compiler/mlir/tosa/transforms/tfl_legalize_patterns.td @@ -31,7 +31,6 @@ def ConvertTFLCeilOp : Pat<(TFL_CeilOp $arg), (Tosa_CeilOp $arg)>; def ConvertTFLFloorOp : Pat<(TFL_FloorOp $arg), (Tosa_FloorOp $arg)>; def ConvertTFLExpOp : Pat<(TFL_ExpOp $arg), (Tosa_ExpOp $arg)>; def ConvertTFLLogOp : Pat<(TFL_LogOp $arg), (Tosa_LogOp $arg)>; -def ConvertTFLRsqrtOp : Pat<(TFL_RsqrtOp $arg), (Tosa_RsqrtOp $arg)>; def ConvertTFLLogicalNotOp : 
Pat<(TFL_LogicalNotOp $arg), (Tosa_LogicalNotOp $arg)>; def ConvertTFLCastOp: Pat<(TFL_CastOp $in), (Tosa_CastOp $in)>; diff --git a/tensorflow/compiler/mlir/tosa/transforms/verify_fully_converted.cc b/tensorflow/compiler/mlir/tosa/transforms/verify_fully_converted.cc new file mode 100644 index 00000000000..478c77ba61a --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/transforms/verify_fully_converted.cc @@ -0,0 +1,90 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir::tosa { + +#define GEN_PASS_DEF_VERIFYFULLYCONVERTED +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" + +namespace { + +static void emitLegalizationErrors(Location loc, + const DenseSet &illegalOps) { + // Print op errors for each of the illegal ops that still remain. + llvm::MapVector opNameCounts; + for (Operation *illegalOp : illegalOps) { + StringRef opName = illegalOp->getName().getStringRef(); + opNameCounts[opName]++; + illegalOp->emitOpError() << ": illegal op still exists"; + } + + std::vector errorMessages; + errorMessages.reserve(opNameCounts.size()); + for (const auto &opInfo : opNameCounts) { + errorMessages.push_back( + llvm::formatv("\t{0} (count: {1})", opInfo.first, opInfo.second)); + } + emitError(loc) << "The following illegal operations still remain: \n" + << llvm::join(errorMessages, "\n") << "\n"; +} + +LogicalResult verifyAllOperationsAreLegal(Operation *op, + const ConversionTarget &target) { + DenseSet illegalOps; + op->walk([&](Operation *op) { + if (!target.isLegal(op)) { + illegalOps.insert(op); + } + }); + if (illegalOps.empty()) return success(); + emitLegalizationErrors(op->getLoc(), illegalOps); + return failure(); +} + +class VerifyFullyConvertedPass + : public impl::VerifyFullyConvertedBase { + public: + // Validates that no TFLite frontends ops are in the function. + void runOnOperation() override { + // We don't just use applyPartialConversion with no patterns because this + // pass shouldn't alter the IR at all (including via folding or + // canonicalizations that dialect conversion does automatically). 
+ ConversionTarget target(getContext()); + target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); + target.addIllegalDialect(); + target.addIllegalOp(); + if (failed(verifyAllOperationsAreLegal(getOperation(), target))) + return signalPassFailure(); + } +}; + +} // anonymous namespace + +std::unique_ptr> createVerifyFullyConvertedPass() { + return std::make_unique(); +} + +} // namespace mlir::tosa diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index f1419a29796..1fb31b9db74 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -50,11 +50,12 @@ py_library( "//tensorflow/python:client", "//tensorflow/python:client_testlib", "//tensorflow/python:framework", - "//tensorflow/python:platform", "//tensorflow/python:random_seed", "//tensorflow/python:session", "//tensorflow/python:variables", "//tensorflow/python/compiler/xla:compiler_py", + "//tensorflow/python/platform:flags", + "//tensorflow/python/platform:tf_logging", "//third_party/py/numpy", ], ) @@ -83,6 +84,7 @@ py_test( ], deps = [ ":xla_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -101,6 +103,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -120,6 +123,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -139,6 +143,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -159,6 +164,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -180,6 +186,7 @@ tf_xla_py_test( "//tensorflow/python:list_ops", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -199,6 +206,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -224,6 +232,7 @@ tf_xla_py_test( "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -269,6 +278,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -311,6 +321,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -326,6 +337,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", + "//tensorflow/python:cond", "//tensorflow/python:control_flow_ops", "//tensorflow/python:control_flow_switch_case", "//tensorflow/python:framework", @@ -333,6 +345,7 @@ tf_xla_py_test( "//tensorflow/python:tensor_array_ops", "//tensorflow/python:training", "//tensorflow/python/eager:function", + "//tensorflow/python/platform:client_testlib", ], ) @@ -354,6 +367,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) 
@@ -371,6 +385,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -397,6 +412,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) @@ -458,6 +474,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -564,6 +581,7 @@ tf_xla_py_test( "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -581,6 +599,7 @@ tf_xla_py_test( "//tensorflow/compiler/tf2xla/python:xla", "//tensorflow/python:array_ops", "//tensorflow/python:dtypes", + "//tensorflow/python/platform:client_testlib", ], ) @@ -621,6 +640,7 @@ tf_xla_py_test( "//tensorflow/compiler/tf2xla/python:xla", "//tensorflow/python:array_ops", "//tensorflow/python:dtypes", + "//tensorflow/python/platform:test", "@absl_py//absl/testing:parameterized", ], ) @@ -655,6 +675,7 @@ tf_xla_py_test( "//tensorflow/python:array_ops", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -671,6 +692,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", + "//tensorflow/python:cond", "//tensorflow/python:framework", "//tensorflow/python:layers", "//tensorflow/python:math_ops", @@ -697,6 +719,7 @@ tf_xla_py_test( "//tensorflow/python:extra_py_tests_deps", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -747,6 +770,7 @@ tf_xla_py_test( srcs = ["ftrl_test.py"], enable_mlir_bridge = False, python_version = "PY3", + shard_count = 16, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -757,6 +781,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -814,7 +839,6 @@ tf_xla_py_test( tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "optonly", # Times out frequently in fastbuild mode. 
- "requires-gpu-nvidia", ], deps = [ ":xla_test", @@ -822,6 +846,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:image_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -842,6 +867,7 @@ tf_xla_py_test( "//tensorflow/python:framework_for_generated_wrappers", "//tensorflow/python:framework_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -898,6 +924,7 @@ tf_xla_py_test( "//tensorflow/python:array_ops", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) @@ -937,6 +964,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1012,6 +1040,7 @@ tf_xla_py_test( "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1074,6 +1103,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) @@ -1101,6 +1131,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) @@ -1180,6 +1211,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:framework", + "//tensorflow/python/platform:test", ], ) @@ -1198,6 +1230,7 @@ tf_xla_py_test( "//tensorflow/python:array_ops", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1221,6 +1254,7 @@ tf_xla_py_test( # "//tensorflow/python:platform_test", # "//tensorflow/python/compat:v2_compat", # "//tensorflow/python/eager:function", +# "//tensorflow/python/platform:client_testlib", # ], # ) # copybara:uncomment_end @@ -1241,6 +1275,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1261,6 +1296,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1300,6 +1336,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1318,6 +1355,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:platform_test", "//tensorflow/python:sparse_ops", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1338,6 +1376,7 @@ tf_xla_py_test( "//tensorflow/python:data_flow_ops", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1361,6 +1400,8 @@ tf_xla_py_test( "//tensorflow/python:standard_ops", "//tensorflow/python:stateful_random_ops", "//tensorflow/python/kernel_tests/random:util", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/platform:flags", ], ) @@ -1382,6 +1423,7 @@ tf_xla_py_test( "//tensorflow/python:standard_ops", "//tensorflow/python:stateless_random_ops", "//tensorflow/python/kernel_tests/random:util", + 
"//tensorflow/python/platform:client_testlib", ], ) @@ -1411,6 +1453,7 @@ tf_xla_py_test( "//tensorflow/python:tensor_array_grad", "//tensorflow/python:tensor_array_ops", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1432,6 +1475,7 @@ tf_xla_py_test( "//tensorflow/python:list_ops", "//tensorflow/python:platform_test", "//tensorflow/python/eager:function", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1441,7 +1485,7 @@ tf_xla_py_test( srcs = ["ternary_ops_test.py"], enable_mlir_bridge = True, python_version = "PY3", - shard_count = 4, + shard_count = 16, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -1462,7 +1506,7 @@ tf_xla_py_test( srcs = ["unary_ops_test.py"], enable_mlir_bridge = True, python_version = "PY3", - shard_count = 4, + shard_count = 32, tags = [ "no_cuda_asan", # times out "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1499,6 +1543,7 @@ tf_xla_py_test( "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) @@ -1543,6 +1588,7 @@ tf_xla_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:training", "//tensorflow/python:while_loop", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1566,6 +1612,7 @@ tf_xla_py_test( "//tensorflow/python:image_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1588,6 +1635,7 @@ tf_xla_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1606,6 +1654,8 @@ tf_xla_py_test( "//tensorflow/python:data_flow_ops", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/platform:flags", ], ) @@ -1623,6 +1673,7 @@ tf_xla_py_test( "//tensorflow/python:array_ops", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1641,6 +1692,7 @@ tf_xla_py_test( "//tensorflow/python:array_ops", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1661,6 +1713,7 @@ tf_xla_py_test( "//tensorflow/compiler/tf2xla/python:xla", "//tensorflow/python:array_ops", "//tensorflow/python:dtypes", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) @@ -1698,6 +1751,7 @@ tf_xla_py_test( "//tensorflow/python:array_ops", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -1737,6 +1791,7 @@ cuda_py_test( "//tensorflow/python:array_ops", "//tensorflow/python:client", "//tensorflow/python:client_testlib", + "//tensorflow/python:cond", "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework", "//tensorflow/python:gradients", @@ -1820,8 +1875,8 @@ tf_cuda_cc_test( shard_count = 20, # This test is randomized, so only run it if explicitly requested. 
tags = [ - "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "manual", + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "notap", ] + tf_cuda_tests_tags(), deps = [":randomized_tests_library"], @@ -1834,8 +1889,8 @@ tf_cuda_cc_test( shard_count = 20, # This test is randomized, so only run it if explicitly requested. tags = [ - "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "manual", + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "notap", ] + tf_cuda_tests_tags(), deps = [":randomized_tests_library"], @@ -1856,8 +1911,8 @@ tf_cuda_cc_test( "config-cuda-only", "no_cuda_asan", # TODO(b/201651800) "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip - "requires-gpu-nvidia", "no_rocm", # ROCmSoftwarePlatform #958 + "requires-gpu-nvidia", ] + tf_cuda_tests_tags(), deps = [":randomized_tests_library"], ) @@ -1877,8 +1932,8 @@ tf_cuda_cc_test( "config-cuda-only", "no_cuda_asan", # TODO(b/201651800) "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip - "requires-gpu-nvidia", "no_rocm", # ROCmSoftwarePlatform #958 + "requires-gpu-nvidia", ] + tf_cuda_tests_tags(), deps = [":randomized_tests_library"], ) @@ -1917,7 +1972,7 @@ py_library( "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:random_ops", - "//tensorflow/python:variables", + "//tensorflow/python:variable_v1", "@six_archive//:six", ], ) @@ -1939,7 +1994,6 @@ cuda_py_test( "//tensorflow/python:gradients", "//tensorflow/python:init_ops", "//tensorflow/python:math_ops", - "//tensorflow/python:platform", "//tensorflow/python:variables", ], ) @@ -2062,6 +2116,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "//tensorflow/python/platform:client_testlib", ], ) @@ -2084,6 +2139,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -2126,6 +2182,7 @@ tf_xla_py_test( "//tensorflow/python:linalg_ops", "//tensorflow/python:platform_test", "//tensorflow/python:standard_ops", + "//tensorflow/python/platform:client_testlib", ], ) @@ -2142,6 +2199,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:framework", + "//tensorflow/python:gradient_checker_v2", "//tensorflow/python:linalg_ops", "//tensorflow/python:platform_test", "//tensorflow/python:standard_ops", @@ -2161,7 +2219,9 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:extra_py_tests_deps", + "//tensorflow/python:gradient_checker_v2", "//tensorflow/python:math_ops", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) @@ -2180,6 +2240,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:extra_py_tests_deps", "//tensorflow/python:math_ops", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) @@ -2202,6 +2263,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:extra_py_tests_deps", "//tensorflow/python:math_ops", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) @@ -2221,6 +2283,7 @@ tf_xla_py_test( "//tensorflow/python:array_ops", "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/platform:client_testlib", ], ) @@ -2246,6 +2309,7 @@ tf_xla_py_test( "//tensorflow/python:errors", 
"//tensorflow/python:framework", "//tensorflow/python/compiler/xla:compiler_py", + "//tensorflow/python/tpu", ], ) @@ -2253,7 +2317,10 @@ tf_xla_py_test( name = "where_op_tpu_test", size = "small", srcs = ["where_op_test.py"], - args = ["--tpu_use_tfrt=true"], + args = [ + "--tpu_use_tfrt=true", + # TODO(b/274633087): Set tf_use_pjrt=true after fixing bug. + ], disabled_backends = [ "cpu", "cpu_ondemand", @@ -2274,6 +2341,7 @@ tf_xla_py_test( "//tensorflow/python:errors", "//tensorflow/python:framework", "//tensorflow/python/compiler/xla:compiler_py", + "//tensorflow/python/tpu", ], ) @@ -2293,6 +2361,7 @@ tf_xla_py_test( "//tensorflow/python:platform_test", "//tensorflow/python/eager:function", "//tensorflow/python/ops/risc:risc_ops", + "//tensorflow/python/platform:client_testlib", ], ) @@ -2325,6 +2394,7 @@ cuda_py_test( ":xla_test", "//tensorflow/python:constant_op", "//tensorflow/python:framework", + "//tensorflow/python/platform:client_testlib", ], ) @@ -2370,6 +2440,7 @@ tf_xla_py_test( "//tensorflow/python:resource_variable_ops", "//tensorflow/python:session", "//tensorflow/python:variables", + "//tensorflow/python/platform:client_testlib", "//tensorflow/python/tpu:tpu_lib", "@absl_py//absl/testing:parameterized", ], @@ -2404,7 +2475,6 @@ tf_xla_py_test( python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip - "notap", ], use_xla_device = False, # Uses tf.function(jit_compile=True) deps = [ diff --git a/tensorflow/compiler/tests/cond_test.py b/tensorflow/compiler/tests/cond_test.py index 9119095b6a3..db767f0d554 100644 --- a/tensorflow/compiler/tests/cond_test.py +++ b/tensorflow/compiler/tests/cond_test.py @@ -24,6 +24,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import cond from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import control_flow_switch_case from tensorflow.python.ops import math_ops @@ -44,7 +45,7 @@ class CondTest(xla_test.XLATestCase): @def_function.function def f(): ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=1) - output = control_flow_ops.cond( + output = cond.cond( constant_op.constant(True), lambda: ta.write(0, 5.), lambda: ta.write(0, 10.)) @@ -64,7 +65,7 @@ class CondTest(xla_test.XLATestCase): @def_function.function def f(): ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=1) - output = control_flow_ops.cond( + output = cond.cond( constant_op.constant(False), lambda: ta.write(0, 5.), lambda: ta.write(0, 10.)) @@ -84,7 +85,7 @@ class CondTest(xla_test.XLATestCase): def f(): ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=1) - output = control_flow_ops.cond( + output = cond.cond( constant_op.constant(True), lambda: ta.write(0, 5.), lambda: ta.write(0, 10.)) @@ -112,7 +113,7 @@ class CondTest(xla_test.XLATestCase): def if_false(): return 5. - output = control_flow_ops.cond( + output = cond.cond( constant_op.constant(True), if_true, if_false) self.assertAllEqual(1., @@ -142,7 +143,7 @@ class CondTest(xla_test.XLATestCase): def if_false(): return 5. - return control_flow_ops.cond( + return cond.cond( constant_op.constant(True), if_true, if_false) output = xla.compile(f) @@ -169,7 +170,7 @@ class CondTest(xla_test.XLATestCase): def if_false(): return array_ops.fill([p], 5.) 
- output = control_flow_ops.cond( + output = cond.cond( constant_op.constant(True), if_true, if_false) with self.assertRaisesRegex(errors.InvalidArgumentError, @@ -202,7 +203,7 @@ class CondTest(xla_test.XLATestCase): def if_false(): return array_ops.fill([p], 5.) - return control_flow_ops.cond(condition, if_true, if_false) + return cond.cond(condition, if_true, if_false) output = xla.compile(f) @@ -304,7 +305,7 @@ class CondTest(xla_test.XLATestCase): xla_context.Enter() for pred in True, False: - cond_out = control_flow_ops.cond( + cond_out = cond.cond( array_ops.placeholder_with_default(pred, []), lambda: constant_op.constant(2.), lambda: constant_op.constant(1.)) diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py index 0b8b5c2d866..6ef5a7c9a9f 100644 --- a/tensorflow/compiler/tests/eager_test.py +++ b/tensorflow/compiler/tests/eager_test.py @@ -21,7 +21,6 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import def_function -from tensorflow.python.eager import function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import indexed_slices @@ -30,7 +29,7 @@ from tensorflow.python.layers import convolutional from tensorflow.python.layers import pooling from tensorflow.python.ops import array_ops from tensorflow.python.ops import array_ops_stack # pylint: disable=g-direct-tensorflow-import -from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import cond from tensorflow.python.ops import embedding_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_random_ops @@ -345,12 +344,12 @@ class EagerFunctionTest(xla_test.XLATestCase): v = resource_variable_ops.ResourceVariable(1.0) w = resource_variable_ops.ResourceVariable(0.0) - @function.defun_with_attributes(attributes={'_noinline': True}) + @def_function.function(experimental_attributes={'_noinline': True}) def g(x): w.assign(w.read_value() + x) return v.read_value() + x * w.read_value() - @function.defun_with_attributes(attributes={'_noinline': True}) + @def_function.function(experimental_attributes={'_noinline': True}) def f(): return g(1.0) + g(2.0) + g(3.0) + g(4.0) + g(5.0) @@ -362,11 +361,11 @@ class EagerFunctionTest(xla_test.XLATestCase): with self.test_scope(): v = resource_variable_ops.ResourceVariable(10.0) - @function.defun_with_attributes(attributes={'_noinline': True}) + @def_function.function(experimental_attributes={'_noinline': True}) def g(): return v.read_value() - @function.defun_with_attributes(attributes={'_noinline': True}) + @def_function.function(experimental_attributes={'_noinline': True}) def f(): return g() + g() + g() + g() + g() @@ -376,11 +375,11 @@ class EagerFunctionTest(xla_test.XLATestCase): with self.test_scope(): v = resource_variable_ops.ResourceVariable(0.0) - @function.defun_with_attributes(attributes={'_noinline': True}) + @def_function.function(experimental_attributes={'_noinline': True}) def g(x): v.assign(x) - @function.defun_with_attributes(attributes={'_noinline': True}) + @def_function.function(experimental_attributes={'_noinline': True}) def f(): g(1.0) g(2.0) @@ -637,7 +636,7 @@ class EagerFunctionTest(xla_test.XLATestCase): def f(pred, value): fn1 = lambda: math_ops.add(value, 1.0) fn2 = lambda: math_ops.subtract(value, 1.0) - return control_flow_ops.cond(pred, fn1, fn2) + return 
cond.cond(pred, fn1, fn2) plus_one = f(constant_op.constant(True), constant_op.constant(10.0)) minus_one = f(constant_op.constant(False), constant_op.constant(10.0)) diff --git a/tensorflow/compiler/tests/giant_const_op_test.py b/tensorflow/compiler/tests/giant_const_op_test.py index c0f4b47be01..014b9d5f1eb 100644 --- a/tensorflow/compiler/tests/giant_const_op_test.py +++ b/tensorflow/compiler/tests/giant_const_op_test.py @@ -56,16 +56,6 @@ def get_tpu_strategy(): # tensors. class GiantConstOp(test.TestCase): - def setUp(self): - super(GiantConstOp, self).setUp() - # Make sure TF_XLA_FLAGS is not already set to avoid dropping the existing - # value silently. - assert "TF_XLA_FLAGS" not in os.environ - - # Disable tfxla constant folding that always creates full Tensors and will - # fail for giant tensors. - os.environ["TF_XLA_FLAGS"] = "--tf_xla_disable_constant_folding=true" - # Verifies that graphs containing giant const tensors that won't fit in memory # are compiled correctly to HLO. def testGiantConst(self): @@ -106,4 +96,12 @@ class GiantConstOp(test.TestCase): self.assertAllEqual(output, expected) if __name__ == "__main__": + # Make sure TF_XLA_FLAGS is not already set to avoid dropping the existing + # value silently. + assert "TF_XLA_FLAGS" not in os.environ + + # Disable tfxla constant folding that always creates full Tensors and will + # fail for giant tensors. + os.environ["TF_XLA_FLAGS"] = "--tf_xla_disable_constant_folding=true" + test.main() diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py index b0f252658a8..7f24610f3e8 100644 --- a/tensorflow/compiler/tests/jit_test.py +++ b/tensorflow/compiler/tests/jit_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import cond from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import math_ops @@ -358,7 +359,7 @@ class XlaCompilationTest(test.TestCase): c = array_ops.placeholder(dtypes.bool) with jit_scope(): z = x + 1.0 - w = control_flow_ops.cond(c, lambda: z, lambda: y) + w = cond.cond(c, lambda: z, lambda: y) t = math_ops.add(z, w) # If JIT compilation chooses to cluster z and t, then execution will diff --git a/tensorflow/compiler/tests/lstm.py b/tensorflow/compiler/tests/lstm.py index 748d5f0a850..8dd2a786155 100644 --- a/tensorflow/compiler/tests/lstm.py +++ b/tensorflow/compiler/tests/lstm.py @@ -27,7 +27,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variables +from tensorflow.python.ops import variable_v1 def Clip(x): @@ -115,7 +115,7 @@ def LSTMLayer(cell_name, weights, m, c, x_seq, pad_seq): def RandomVar(shape, name=None): """Returns a variable of the given shape initialized to random values.""" - return variables.VariableV1( + return variable_v1.VariableV1( random_ops.random_uniform(shape), dtype=dtypes.float32, name=name) diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py index 012fe158e1c..61c187cf7c4 100644 --- a/tensorflow/compiler/tests/stateless_random_ops_test.py +++ b/tensorflow/compiler/tests/stateless_random_ops_test.py @@ -368,9 +368,11 @@ class 
StatelessRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): self._testParameterizedTruncatedNormal(-1., 1., -2., 2.) def testParameterizedTruncatedNormalRightTail(self): + self.skipTest('b/276957102') self._testParameterizedTruncatedNormal(0., 1., 4., 20., variance_rtol=2e-2) def testParameterizedTruncatedNormalLeftTail(self): + self.skipTest('b/276957102') self._testParameterizedTruncatedNormal( 0., 1., -20., -4., variance_rtol=5e-2) diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py index 9a6b3dd0a73..d21a8fb4cc5 100644 --- a/tensorflow/compiler/tests/tensor_array_ops_test.py +++ b/tensorflow/compiler/tests/tensor_array_ops_test.py @@ -860,7 +860,7 @@ class TensorArrayTest(xla_test.XLATestCase): # c = lambda i, acc: i < 5 # def b(i, acc): - # x1 = control_flow_ops.cond( + # x1 = cond.cond( # math_ops.equal(i, 0), lambda: x, # lambda: math_ops.multiply(acc.read(i - 1), 2.0)) # return i + 1, acc.write(i, x1) diff --git a/tensorflow/compiler/tests/where_op_test.py b/tensorflow/compiler/tests/where_op_test.py index 186877c03fe..e150b52567c 100644 --- a/tensorflow/compiler/tests/where_op_test.py +++ b/tensorflow/compiler/tests/where_op_test.py @@ -16,15 +16,23 @@ # pylint: disable=g-direct-tensorflow-import from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test +from tensorflow.python.tpu import tpu # pylint: enable=g-direct-tensorflow-import class WhereOpTest(xla_test.XLATestCase): + def __init__(self, method_name="runTest"): + super(WhereOpTest, self).__init__(method_name) + if config.list_logical_devices("TPU"): + with self.session() as sess: + sess.run(tpu.initialize_system()) + def testWhere(self): """Test first form of where (return indices).""" diff --git a/tensorflow/compiler/tests/xla_call_module_test.py b/tensorflow/compiler/tests/xla_call_module_test.py index e0923f32bac..01f30718217 100644 --- a/tensorflow/compiler/tests/xla_call_module_test.py +++ b/tensorflow/compiler/tests/xla_call_module_test.py @@ -13,12 +13,15 @@ # limitations under the License. 
# ============================================================================== """Tests for XLA call module op wrapper.""" - +from typing import Tuple import unittest + import numpy as np from tensorflow.compiler.tests import xla_test +from tensorflow.compiler.tf2xla.ops import gen_xla_ops from tensorflow.compiler.tf2xla.python import xla + from tensorflow.python.eager import def_function from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -27,6 +30,14 @@ from tensorflow.python.ops import array_ops from tensorflow.python.platform import googletest +def serialize(module_str: str) -> Tuple[str, int]: + # TODO(b/274838200): error importing xla_extension in OSS + # target_version = '0.9.0' # TODO(gleasonk): use APIs to get this + # return xla_extension.mlir.serialize_portable_artifact( + # module_str, target_version), 4 + return module_str, 3 + + class XlaCallModuleOpTest(xla_test.XLATestCase): def _assertOpOutputMatchesExpected(self, @@ -64,7 +75,7 @@ class XlaCallModuleOpTest(xla_test.XLATestCase): def f(x): # sin(cos(x)) - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg0: tensor<3xf32>) -> tensor<3xf32> { %0 = stablehlo.cosine %arg0 : tensor<3xf32> @@ -72,8 +83,8 @@ module @jit_f.0 { return %1 : tensor<3xf32> } } -""" - return xla.call_module([x], version=2, +""") + return xla.call_module([x], version=version, module=module, Tout=[x.dtype], Sout=[x.shape]) self._assertOpOutputMatchesExpected(f, (x,), (np.sin(np.cos(x)),)) @@ -84,7 +95,7 @@ module @jit_f.0 { def f(x): # return x >= 1 - module = """ + module, version = serialize(""" module @jit_f_jax.0 { func.func public @main(%arg0: tensor) -> tensor { %0 = stablehlo.constant dense<1> : tensor @@ -92,8 +103,8 @@ module @jit_f_jax.0 { return %1 : tensor } } -""" - return xla.call_module([x], version=2, +""") + return xla.call_module([x], version=version, module=module, Tout=[res.dtype], Sout=[res.shape]) @@ -106,7 +117,7 @@ module @jit_f_jax.0 { def f(x, y): # (sin(x), cos(y)) - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg0: tensor<3xf32>, %arg1: tensor<4xf64>) -> (tensor<3xf32>, tensor<4xf64>) { %0 = stablehlo.sine %arg0 : tensor<3xf32> @@ -114,8 +125,8 @@ module @jit_f.0 { return %0, %1 : tensor<3xf32>, tensor<4xf64> } } -""" - return xla.call_module([x, y], version=2, +""") + return xla.call_module([x, y], version=version, module=module, Tout=[x.dtype, y.dtype], Sout=[x.shape, y.shape]) @@ -128,16 +139,15 @@ module @jit_f.0 { def f(x): # x: f32[2, b] # Module takes another argument which is the value of b # (sin(x), x.shape[1]) - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg0: tensor, %arg1: tensor<2x?xf32>) -> (tensor<2x?xf32>, tensor) { %0 = stablehlo.sine %arg1 : tensor<2x?xf32> return %0, %arg0 : tensor<2x?xf32>, tensor } } -""" - return xla.call_module([x], - version=2, +""") + return xla.call_module([x], version=version, module=module, Tout=[x.dtype, np.int32], Sout=[(None, 3), ()], @@ -151,17 +161,16 @@ module @jit_f.0 { def f(x): # x: f32[2, b] # Module takes another argument which is the value of b # (sin(x), x.shape[1]) - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg0: tensor, %arg1: tensor<2x?xf32>) -> (tensor<2x?xf32>, tensor) { %0 = stablehlo.sine %arg1 : tensor<2x?xf32> return %0, %arg0 : tensor<2x?xf32>, tensor } } -""" +""") return xla.call_module([x], - version=2, - module=module, + 
module=module, version=version, Tout=[x.dtype, np.int64], Sout=[(None, 3), ()], dim_args_spec=['0.1']) @@ -174,7 +183,7 @@ module @jit_f.0 { def f(x): # x: f32[2, b] # (sin(x), x.shape[1]) - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg1: tensor<2x?xf32>) -> (tensor<2x?xf32>, tensor) { %arg0_new = "stablehlo.get_dimension_size"(%arg1) {dimension = 1 : i64} : (tensor<2x?xf32>) -> tensor @@ -186,9 +195,9 @@ module @jit_f.0 { return %0, %arg0 : tensor<2x?xf32>, tensor } } -""" - return xla.call_module([x], version=2, - module=module, +""") + return xla.call_module([x], + module=module, version=version, Tout=[x.dtype, np.int32], Sout=[(None, 3), ()]) @@ -201,7 +210,7 @@ module @jit_f.0 { # Module takes two prefix arguments with the values of b and c # return (sin(x + y), x.shape[1]) - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x?x?xf32>, %arg3: tensor<2x?x?xf32>) -> (tensor<2x?x?xf32>, tensor) { %0 = stablehlo.add %arg2, %arg3 : tensor<2x?x?xf32> @@ -209,13 +218,12 @@ module @jit_f.0 { return %1, %arg0 : tensor<2x?x?xf32>, tensor } } -""" +""") dim_args_spec = ['0.1', '0.2'] def f(x, y): return xla.call_module([x, y], - version=2, - module=module, + module=module, version=version, Tout=[x.dtype, np.int32], Sout=[(None, 3), ()], dim_args_spec=dim_args_spec) @@ -274,7 +282,7 @@ module @jit_f.0 { x = np.float32(0.) # returns x + 2. on CPU, x + 3. on GPU and x + 4. on TPU - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg_platform_idx: tensor, %arg0: tensor) -> tensor { %to_add = "stablehlo.case"(%arg_platform_idx) ({ @@ -291,12 +299,11 @@ module @jit_f.0 { return %0 : tensor } } -""" +""") platforms = ['CPU', 'CUDA', 'TPU'] def f(x): - return xla.call_module([x], - version=3, + return xla.call_module([x], version=version, module=module, Tout=[np.float32], Sout=[()], @@ -310,7 +317,7 @@ module @jit_f.0 { y = np.arange(3., dtype=np.float32) # returns x + x on CPU and x - x on TPU - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg_platform_idx: tensor, %arg_dim0: tensor, %arg0: tensor, %arg1: tensor) -> tensor { %res = "stablehlo.case"(%arg_platform_idx) ({ @@ -323,10 +330,9 @@ module @jit_f.0 { return %res : tensor } } -""" +""") def f(x, y): - return xla.call_module([x, y], - version=3, + return xla.call_module([x, y], version=version, module=module, Tout=[np.float32], Sout=[(None,)], @@ -341,18 +347,17 @@ module @jit_f.0 { """Error reporting for the platforms attribute.""" x = np.float32(0.) - module = """ + module_str = """ module @jit_f.0 { func.func public @main(%arg_platform_idx: tensor, %arg0: tensor) -> tensor { return %arg0 : tensor } } """ + module, version = serialize(module_str) platforms = [] - version = 3 def f(x): - return xla.call_module([x], - version=version, + return xla.call_module([x], version=version, module=module, Tout=[np.float32], Sout=[()], @@ -376,17 +381,6 @@ module @jit_f.0 { 'and 0 dimension arguments.'): self._assertOpOutputMatchesExpected(f, (x,), (x,)) - # Same if the version is 2 - platforms = ['CPU', 'CUDA', 'TPU'] - version = 2 - with self.assertRaisesRegex( - errors.InvalidArgumentError, - 'Incorrect number of arguments passed to XlaCallModule: 1. 
' - 'The module takes 2 arguments of which 0 platform index arguments ' - 'and 0 dimension arguments.'): - self._assertOpOutputMatchesExpected(f, (x,), (x,)) - - version = 3 platforms = ['RANDOM_PLATFORM_1', 'RANDOM_PLATFORM_2'] with self.assertRaisesRegex( errors.NotFoundError, @@ -403,7 +397,7 @@ module @jit_f.0 { self._assertOpOutputMatchesExpected(f, (x,), (x,)) # The module cannot have i64 %arg_platform_idx - module = module.replace('i32', 'i64') + module, version = serialize(module_str.replace('i32', 'i64')) platforms = ['CPU', 'CUDA', 'TPU'] with self.assertRaisesRegex( errors.InvalidArgumentError, @@ -413,13 +407,13 @@ module @jit_f.0 { self._assertOpOutputMatchesExpected(f, (x,), (x,)) # A module without the platform index argument - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg0: tensor) -> tensor { return %arg0 : tensor } } -""" +""") with self.assertRaisesRegex( errors.InvalidArgumentError, 'The module should have 1 platform index arguments and 0 dimension ' @@ -432,7 +426,7 @@ module @jit_f.0 { def f(x): # x: f32[b, 5] # return np.arange(x.shape[0], dtype=np.int32) - module = """ + module, version = serialize(""" module @jit_fun.1 { func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> @@ -440,8 +434,8 @@ module @jit_fun.1 { return %1 : tensor } } -""" - return xla.call_module([x,], version=2, +""") + return xla.call_module([x,], version=version, module=module, Tout=[res.dtype], Sout=[(None,)], @@ -453,17 +447,16 @@ module @jit_fun.1 { """We can construct the tf.Graph on all platforms.""" x = np.float32(0.) - module = """ + module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg_platform_idx: tensor, %arg0: tensor) -> tensor { return %arg0 : tensor } } -""" +""") platforms = ['TPU'] # the module is compileable only on TPU def f(x): - return xla.call_module([x], - version=3, + return xla.call_module([x], version=version, module=module, Tout=[np.float32], Sout=[()], @@ -476,7 +469,7 @@ module @jit_f.0 { res = x.reshape((-1,)) def f(x): # x: f32[b, 3] - module = """ + module, version = serialize(""" module @jit_fun_flat_jax { func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<3> : tensor @@ -486,8 +479,8 @@ module @jit_fun_flat_jax { return %3 : tensor } } -""" - return xla.call_module([x], +""") + return xla.call_module([x], version=version, module=module, Tout=[res.dtype], Sout=[(None,)], @@ -500,7 +493,7 @@ module @jit_fun_flat_jax { res = np.ones((3, 2), dtype=np.float32) def f(x): # x: f32[b, 4] - module = """ + module, version = serialize(""" module @jit_fun_flat_jax { func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<0> : tensor @@ -512,8 +505,8 @@ module @jit_fun_flat_jax { return %5 : tensor } } -""" - return xla.call_module([x], +""") + return xla.call_module([x], version=version, module=module, Tout=[res.dtype], Sout=[(None, 2)], @@ -526,7 +519,7 @@ module @jit_fun_flat_jax { res = x[-1, :] def f(x): # x: f32[b, 4] - module = """ + module, version = serialize(""" module @jit_fun_flat_jax { func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor<4xf32> { %0 = stablehlo.constant dense<-1> : tensor @@ -543,8 +536,8 @@ module @jit_fun_flat_jax { return %12 : tensor<4xf32> } } -""" - return xla.call_module([x], +""") + return xla.call_module([x], version=version, module=module, Tout=[x.dtype], Sout=[(4,)], @@ -558,7 +551,7 @@ module 
@jit_fun_flat_jax { res = x # The update should be a nop def f(x, idx): # x: f32[b, 4] idx: i32 - module = """ + module, version = serialize(""" module @jit_fun_flat_jax { func.func public @main(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { %0 = stablehlo.constant dense<0> : tensor @@ -570,8 +563,8 @@ module @jit_fun_flat_jax { return %5 : tensor } } -""" - return xla.call_module([x, idx], +""") + return xla.call_module([x, idx], version=version, module=module, Tout=[res.dtype], Sout=[(None, 4)], @@ -586,7 +579,7 @@ module @jit_fun_flat_jax { def f(x, y): # x: f32[b, 4] y: f32[2, b, 4] # return (np.broadcast_to(x, y.shape), x + y) - module = """ + module, version = serialize(""" module @jit_fun.0 { func.func public @main(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x?x4xf32>) -> (tensor<2x?x4xf32>, tensor<2x?x4xf32>) { %0 = stablehlo.constant dense<2> : tensor<1xi32> @@ -598,8 +591,8 @@ module @jit_fun.0 { return %5, %6 : tensor<2x?x4xf32>, tensor<2x?x4xf32> } } -""" - return xla.call_module([x, y], version=2, +""") + return xla.call_module([x, y], version=version, module=module, Tout=[res[0].dtype, res[1].dtype], Sout=[(2, None, 4), (2, None, 4)], @@ -613,7 +606,7 @@ module @jit_fun.0 { res = np.sum(x) * x.shape[0] def f(x): # x: i32[b] - module = """ + module, version = serialize(""" module @jit_fun{ func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<0> : tensor @@ -626,8 +619,8 @@ module @jit_fun{ return %2 : tensor } } -""" - return xla.call_module([x], version=1, +""") + return xla.call_module([x], version=version, module=module, Tout=[res.dtype], Sout=[res.shape], @@ -640,7 +633,7 @@ module @jit_fun{ res = np.arange(3, dtype=np.float32).reshape(3, 1) * 5 def f(x): # x: f32[b, 5] - module = """ + module, version = serialize(""" module @jit_fun_flat_jax { func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<0.000000e+00> : tensor @@ -656,8 +649,8 @@ module @jit_fun_flat_jax { return %5 : tensor } } -""" - return xla.call_module([x,], +""") + return xla.call_module([x,], version=version, module=module, Tout=[res.dtype], Sout=[(None, 1)], @@ -671,7 +664,7 @@ module @jit_fun_flat_jax { res = np.arange(x.shape[0], dtype=np.int32) def f(x): # x: f32[b] - module = """ + module, version = serialize(""" module @jit_fun_3 { func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = call @f(%arg0, %arg1) : (tensor, tensor) -> tensor @@ -683,8 +676,8 @@ module @jit_fun_3 { return %1 : tensor } } -""" - return xla.call_module([x,], version=2, +""") + return xla.call_module([x,], version=version, module=module, Tout=[res.dtype], Sout=[()], @@ -697,15 +690,14 @@ module @jit_fun_3 { res = x def f(x): # x: f32[b] - module = """ + module, version = serialize(""" module @jit_fun_3 { func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { return %arg1 : tensor } } -""" - return xla.call_module([x], - version=2, +""") + return xla.call_module([x], version=version, module=module, Tout=[res.dtype], Sout=[()], @@ -723,7 +715,7 @@ module @jit_fun_3 { res1 = np.int64(5) def f(x): # x: f32[b] - module = """ + module, version = serialize(""" module @jit_fun_flat_jax { func.func public @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { %0 = stablehlo.constant dense<0> : tensor @@ -744,8 +736,8 @@ module @jit_fun_flat_jax { return %1#0, %1#1 : tensor, tensor } } -""" - return xla.call_module([x,], version=2, +""") + return xla.call_module([x,], version=version, module=module, 
Tout=[res0.dtype, res1.dtype], Sout=[(None,), res1.shape], @@ -753,6 +745,34 @@ module @jit_fun_flat_jax { self._assertOpOutputMatchesExpected(f, (x,), (res0, res1)) + def test_op_backward_compatibility(self): + """Test for ensuring XlaCallModuleOp backward compatibility.""" + x = np.array([1.0, 2.0, 3.0], dtype=np.float32) + + def f(x): + # sin(cos(x)) + module, version = serialize(""" +module @jit_f.0 { + func.func public @main(%arg0: tensor<3xf32>) -> tensor<3xf32> { + %0 = stablehlo.cosine %arg0 : tensor<3xf32> + %1 = stablehlo.sine %0 : tensor<3xf32> + return %1 : tensor<3xf32> + } +} +""") + # Create the raw XlaCallModule op directly instead of calling + # `xla.call_module`, which handles default values for missing + # attributes. + return gen_xla_ops.xla_call_module( + [x], + version=version, + module=module, + Tout=[x.dtype], + Sout=[x.shape], + ) + + self._assertOpOutputMatchesExpected(f, (x,), (np.sin(np.cos(x)),)) + if __name__ == '__main__': # This test is using Tensorflow sessions which are not compatible with eager diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index c4fb8e3f44c..f8ce172dc34 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -289,15 +289,44 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): padding_value=7, padding_low=[2, 1], padding_high=[1, 2], - padding_interior=[1, 0]) + padding_interior=[1, 0], + ) self._assertOpOutputMatchesExpected( pad_fn, args=(np.arange(4, dtype=np.int32).astype(dtype).reshape([2, 2]),), expected=np.array( - [[7, 7, 7, 7, 7], [7, 7, 7, 7, 7], [7, 0, 1, 7, 7], - [7, 7, 7, 7, 7], [7, 2, 3, 7, 7], [7, 7, 7, 7, 7]], - dtype=dtype)) + [ + [7, 7, 7, 7, 7], + [7, 7, 7, 7, 7], + [7, 0, 1, 7, 7], + [7, 7, 7, 7, 7], + [7, 2, 3, 7, 7], + [7, 7, 7, 7, 7], + ], + dtype=dtype, + ), + ) + + def testSetDynamicDimensionSize(self): + dynamic_size = 7 + + # XLA doesn't support this for bfloat16. + for dtype in set(self.numeric_types).intersection( + set([np.int32, np.float32, np.float64, np.complex64])): + + def xla_set_dynamic_dimension_size_fn(x): + # Tell XLA to cut the array to size=dynamic_size.
+ return gen_xla_ops.xla_set_dynamic_dimension_size( + x, dim_index=0, size=dynamic_size + ) + + a = np.arange(10, dtype=np.int32).astype(dtype) + expected = a[:dynamic_size] + + self._assertOpOutputMatchesExpected( + xla_set_dynamic_dimension_size_fn, args=(a,), expected=expected + ) def testPadNegative(self): for dtype in self.numeric_types: @@ -574,6 +603,41 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(values_1, values_2), expected=(values_1, values_2)) + @test_util.disable_mlir_bridge('Not supported yet') + def testScatter(self): + test_array = np.arange(9).astype(np.int32).reshape((3, 3)) + scatter_indices = np.array([0, 2], dtype=np.int32) + updates = np.array([[10, 20, 30], [70, 80, 90]], dtype=np.int32) + + dnums = xla_data_pb2.ScatterDimensionNumbers() + dnums.update_window_dims.append(1) + dnums.inserted_window_dims.append(0) + dnums.scatter_dims_to_operand_dims.append(0) + dnums.index_vector_dim = 1 + + add_numbers = function.Defun(np.int32, np.int32)(lambda x, y: x + y) + + def test_fn( + scatter_input, + scatter_indices, + scatter_updates, + ): + return gen_xla_ops.xla_scatter( + scatter_input, + scatter_indices, + scatter_updates, + add_numbers, + dnums.SerializeToString(), + indices_are_sorted=False, + ) + + expected = np.array([[10, 21, 32], [3, 4, 5], [76, 87, 98]], dtype=np.int32) + self._assertOpOutputMatchesExpected( + test_fn, + args=(test_array, scatter_indices, updates), + expected=expected, + ) + def testSelectAndScatter(self): for dtype in set(self.numeric_types).intersection( set([dtypes.bfloat16.as_numpy_dtype, np.float32])): diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index d0ce575deef..bb48f9e806b 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -102,9 +102,9 @@ tf_cuda_cc_test( ":utils", "//tensorflow/compiler/xla/stream_executor/gpu:gpu_init", "//tensorflow/core:lib", - "//tensorflow/core/platform:stream_executor", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/platform:stream_executor", ] + if_tensorrt([ ":tensorrt_lib", ]) + select({ @@ -211,11 +211,11 @@ cc_library( visibility = ["//visibility:private"], deps = [ ":trt_conversion", + "//tensorflow/cc:cc_ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/framework:tensor_testutil", "@com_google_absl//absl/strings", "@com_google_googletest//:gtest", - "//tensorflow/core:protos_all_cc", - "//tensorflow/cc:cc_ops", - "//tensorflow/core/framework:tensor_testutil", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -230,8 +230,8 @@ tf_cuda_cc_test( ], deps = [ ":testutils", - "//tensorflow/core:test_main", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test_main", "//tensorflow/core/platform:protobuf", ] + if_tensorrt([ ":tensorrt_lib", @@ -247,6 +247,7 @@ cc_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":common_utils", ":trt_allocator", ":trt_conversion", ":trt_engine_utils", @@ -254,19 +255,18 @@ cc_library( ":trt_plugins", ":trt_resources", ":utils", - ":common_utils", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", "//tensorflow/core:framework", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core/platform:stream_executor", "//tensorflow/core:stream_executor_headers_lib", "//tensorflow/core/common_runtime:core_cpu_lib_no_ops", "//tensorflow/core/grappler/costs:graph_properties", 
+ "//tensorflow/core/platform:stream_executor", "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ] + if_tensorrt([ ":tensorrt_lib", "@local_config_cuda//cuda:cuda_headers", @@ -285,14 +285,14 @@ cc_library( ":trt_logging", ":trt_plugins", ":trt_resources", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", "//tensorflow/core:framework", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), alwayslink = 1, ) @@ -342,16 +342,11 @@ tf_cuda_cc_test( "nomac", ], deps = [ + ":testutils", + ":trt_conversion", ":trt_op_kernels", ":trt_op_libs", ":trt_resources", - ":trt_conversion", - ":testutils", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "//third_party/eigen3", "//tensorflow/cc:cc_ops", "//tensorflow/cc:function_ops", "//tensorflow/cc:scope", @@ -362,10 +357,15 @@ tf_cuda_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core/kernels:ops_testutil", - "//tensorflow/core/kernels:function_ops", - "//tensorflow/core/kernels:array", "//tensorflow/core/framework:fake_input", + "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:ops_testutil", + "//third_party/eigen3", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", ] + if_tensorrt([ "@local_config_cuda//cuda:cuda_headers", ]), @@ -401,17 +401,17 @@ tf_cuda_library( ], deps = [ ":common_utils", - ":trt_logging", - ":utils", ":trt_allocator", + ":trt_logging", ":trt_parameters", - "@com_google_absl//absl/strings", + ":utils", "//tensorflow/core:framework", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_headers_lib", "//tensorflow/core/platform:status", "//tensorflow/core/profiler/lib:traceme", - "//tensorflow/core:stream_executor_headers_lib", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -424,8 +424,8 @@ tf_cuda_library( ":common_utils", ":logger_registry", ":utils", - "@com_google_absl//absl/strings", "//tensorflow/core:lib_proto_parsing", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -445,7 +445,6 @@ tf_custom_op_py_library( ":trt_ops", "//tensorflow/python:errors", "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:platform", "//tensorflow/python:resources", ], ) @@ -459,9 +458,9 @@ tf_cuda_library( copts = tf_copts(), deps = [ ":utils", - "@com_google_absl//absl/strings", - "//tensorflow/core:lib", "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -485,10 +484,10 @@ tf_cuda_library( ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", - "//tensorflow/core/grappler:op_types", - "//tensorflow/core:graph", "//tensorflow/core:gpu_runtime", + "//tensorflow/core:graph", "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core/grappler:op_types", ] + if_tensorrt([":tensorrt_lib"]), ) 
@@ -557,8 +556,8 @@ tf_cuda_library( ], copts = tf_copts(), deps = [ - "@com_google_absl//absl/strings", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -571,8 +570,8 @@ tf_cuda_library( copts = tf_copts(), deps = [ ":utils", - "//tensorflow/core:lib", "//tensorflow/core:framework", + "//tensorflow/core:lib", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -602,10 +601,10 @@ tf_cuda_library( ], visibility = ["//tensorflow:__subpackages__"], deps = [ - ":utils", ":op_converter", - "@com_google_absl//absl/strings", + ":utils", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -616,8 +615,8 @@ tf_cuda_library( ], copts = tf_copts(), deps = [ - ":utils", ":op_converter", + ":utils", "//tensorflow/core:lib", "//tensorflow/core/platform:env", "//tensorflow/core/platform:logging", @@ -704,27 +703,17 @@ tf_cuda_library( ":algorithm_selector", ":common_utils", ":logger_registry", - ":segment", - ":trt_allocator", - ":trt_parameters", - ":trt_plugins", - ":trt_logging", - ":trt_resources", - ":utils", - ":trt_weights", ":op_converter", ":op_converter_registry", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", + ":segment", + ":trt_allocator", + ":trt_logging", + ":trt_parameters", + ":trt_plugins", + ":trt_resources", + ":trt_weights", + ":utils", "//tensorflow/cc:array_ops", - "//tensorflow/core/common_runtime:core_cpu", - "//tensorflow/core/grappler/clusters:cluster", - "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", - "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler:op_types", - "//tensorflow/core/grappler:utils", - "//tensorflow/core/grappler/utils:functions", "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//tensorflow/core:gpu_runtime", @@ -732,13 +721,23 @@ tf_cuda_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime:core_cpu", "//tensorflow/core/grappler:devices", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", + "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core/grappler/clusters:virtual_cluster", "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", "//tensorflow/core/grappler/optimizers:meta_optimizer", - "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/grappler/utils:functions", "//tensorflow/core/profiler/lib:annotated_traceme", + "//tensorflow/core/profiler/lib:traceme", "//tensorflow/tools/graph_transforms:transform_utils", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps() + select({ ":use_efficient_nms_plugin": [":efficient_nms_plugin"], "//conditions:default": [], @@ -756,17 +755,13 @@ tf_cuda_cc_test( "nomac", ], deps = [ + ":testutils", + ":trt_conversion", ":trt_op_kernels", ":trt_op_libs", - ":trt_conversion", - ":testutils", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/strings", "//tensorflow/cc:cc_ops", "//tensorflow/cc:ops", "//tensorflow/cc:scope", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core:core_cpu", 
"//tensorflow/core:core_cpu_base", "//tensorflow/core:direct_session", @@ -775,6 +770,10 @@ tf_cuda_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/clusters:cluster", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", ] + if_tensorrt([":tensorrt_lib"]), ) @@ -791,31 +790,31 @@ tf_cuda_cc_test( "nomac", ], deps = [ - ":trt_logging", - ":trt_conversion", - ":trt_plugins", - ":trt_engine_utils", - ":utils", ":testutils", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", + ":trt_conversion", + ":trt_engine_utils", + ":trt_logging", + ":trt_plugins", + ":utils", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", "//tensorflow/cc:ops", "//tensorflow/cc:scope", - "//tensorflow/core/grappler/costs:graph_properties", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", "//tensorflow/core/framework:tensor_testutil", + "//tensorflow/core/grappler/costs:graph_properties", "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:resource_variable_ops", - "//tensorflow/core:test", "//tensorflow/core/platform:status_matchers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", ] + if_tensorrt([ ":tensorrt_lib", "@local_config_cuda//cuda:cuda_headers", @@ -834,38 +833,38 @@ tf_cuda_cc_test( "nomac", ], deps = [ - ":trt_logging", + ":testutils", ":trt_conversion", ":trt_convert_api", - ":trt_plugins", ":trt_engine_utils", + ":trt_logging", ":trt_op_kernels", + ":trt_plugins", ":trt_resources", ":utils", - ":testutils", - "//tensorflow/compiler/jit:shape_inference", - "@com_google_googletest//:gtest", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", "//tensorflow/cc:ops", "//tensorflow/cc:scope", + "//tensorflow/compiler/jit:shape_inference", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/framework:tensor_testutil", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core/platform:status_matchers", - "//tensorflow/core/kernels:ops_testutil", - "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/framework:tensor_testutil", "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:nn", + "//tensorflow/core/kernels:ops_testutil", "//tensorflow/core/kernels:pooling_ops", + "//tensorflow/core/platform:status_matchers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", ] + if_tensorrt([ ":tensorrt_lib", "@local_config_cuda//cuda:cuda_headers", @@ -963,12 +962,12 @@ cc_library( ], copts = tf_copts(), deps = [ - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/strings", "//tensorflow/core:framework", "//tensorflow/core:graph", - "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:lib", + "//tensorflow/core:lib_proto_parsing", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", ] + if_tensorrt([":tensorrt_lib"]), ) diff 
--git a/tensorflow/compiler/tf2tensorrt/common/utils.cc b/tensorflow/compiler/tf2tensorrt/common/utils.cc index 69ecc84dca7..92166c2e79e 100644 --- a/tensorflow/compiler/tf2tensorrt/common/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/common/utils.cc @@ -213,6 +213,11 @@ std::ostream& operator<<(std::ostream& os, const nvinfer1::DataType& v) { case nvinfer1::DataType::kHALF: os << "kHalf"; break; +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + os << "kFP8"; + break; +#endif case nvinfer1::DataType::kINT8: os << "kINT8"; break; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 79a60d2b1de..e809152c1e7 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -914,7 +914,7 @@ Status ConvertGraph(const TRTOptimizationPass::ConversionParams& params, } else { // Graph is not modified. LOG_WARNING_WITH_PREFIX << "Cannot replace " << msg - << " reason: " << status.error_message() + << " reason: " << status.message() << " (keeping original segment)."; } if (VLOG_IS_ON(1)) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 914576f7552..1c3a1903477 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -1010,11 +1010,11 @@ Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) { &tensor_or_weights); if (!status.ok()) { VLOG(2) << "Failed to convert input `" << src_def.name() << "` to a " - << "TRT_TensorOrWeights: " << status.error_message(); + << "TRT_TensorOrWeights: " << status.message(); return errors::Internal( "Failed to convert at least one input to a TRT_TensorOrWeights: ", - status.error_message()); + status.message()); } inputs.push_back(tensor_or_weights); } @@ -1131,11 +1131,10 @@ Status Converter::ConvertNode(const NodeDef& node_def) { << output.DebugString(); Status status = AddTensorOrWeights(output_name, output); if (!status.ok()) { - return errors::Create( - static_cast(status.code()), - StrCat("Failed to add output for node: ", node_def.name(), ": ", - status.error_message()), - errors::GetPayloads(status)); + return errors::Create(static_cast(status.code()), + StrCat("Failed to add output for node: ", + node_def.name(), ": ", status.message()), + errors::GetPayloads(status)); } } return OkStatus(); @@ -1151,7 +1150,7 @@ Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype, status = MaybeUpdateBatchSize(batch_size); if (!status.ok()) { return errors::CreateWithUpdatedMessage( - status, batch_size_error(name, status.error_message())); + status, batch_size_error(name, status.message())); } } ITensorProxyPtr tensor = network()->addInput(name.c_str(), dtype, dims); @@ -1162,8 +1161,8 @@ Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype, status = AddTensorOrWeights(name, TRT_TensorOrWeights(tensor)); if (!status.ok()) { return errors::CreateWithUpdatedMessage( - status, StrCat("Failed to add input tensor ", name, ": ", - status.error_message())); + status, + StrCat("Failed to add input tensor ", name, ": ", status.message())); } return OkStatus(); } @@ -1173,8 +1172,8 @@ Status Converter::AddInputResource(const string& name, Status status = AddTensorOrWeights(name, TRT_TensorOrWeights(resource)); if (!status.ok()) { return errors::CreateWithUpdatedMessage( - status, StrCat("Failed 
to add input resource ", name, ": ", - status.error_message())); + status, + StrCat("Failed to add input resource ", name, ": ", status.message())); } return OkStatus(); } @@ -1376,7 +1375,7 @@ Status Converter::BuildCudaEngine( auto cache = registry->LookUp("default_cache", builder_config.get()); if (!cache.ok()) { LOG(WARNING) << "failed to create a timing cache: " - << cache.status().error_message(); + << cache.status().message(); } else { timing_cache = std::move(*cache); builder_config->setTimingCache(*timing_cache, /*ignoreMismatch*/ false); @@ -2537,7 +2536,7 @@ Status Converter::SqueezeTensor(ITensorProxyPtr input, // Reshape tensor. TF_RETURN_IF_ERROR(PrepareTensorForShape( params->converter, TRT_TensorOrWeights(input), DimsAdapter(*input_dims), - /*validation_only=*/false, output, params->node_def)); + /*validation_only=*/false, output, params->node_def, op_instance)); return OkStatus(); } @@ -5945,7 +5944,7 @@ Status ConvertGraphDefToEngine( if (!status.ok()) { const string error_message = StrCat("Validation failed for ", node_name, " and input slot ", - slot_number, ": ", status.error_message()); + slot_number, ": ", status.message()); LOG_WARNING_WITH_PREFIX << error_message; return errors::CreateWithUpdatedMessage(status, error_message); } @@ -6238,7 +6237,7 @@ std::string unexpected_type_error_msg(nvinfer1::DataType type_being_checked, DebugString(type_being_checked) + "."; } -string batch_size_error(const string& name, const string& comment) { +string batch_size_error(absl::string_view name, absl::string_view comment) { return StrCat("Batch size doesn't match for tensor '", name, "' : ", comment); } diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index be675a1a9c6..e9afd320be9 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -574,7 +574,7 @@ std::string input_shapes_error_msg(const nvinfer1::Dims& shape1, const nvinfer1::Dims& shape2, const NodeDef& node, bool then_vs_else = false); -std::string batch_size_error(const string& name, const string& comment); +std::string batch_size_error(absl::string_view name, absl::string_view comment); inline bool find_name(const string& name, const std::vector names) { return std::find(names.begin(), names.end(), name) != names.end(); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index d1c75833219..91b5b3540eb 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1457,7 +1457,7 @@ class OpConverterTest : public ::testing::Test { void RunConversion(const Node* node, absl::StatusCode expected_code = absl::StatusCode::kOk, - const std::string& expected_msg_substr = "") { + absl::string_view expected_msg_substr = "") { EXPECT_THAT(converter_->ConvertNode(node->def()), StatusIs(expected_code, HasSubstr(expected_msg_substr))); if (expected_code == absl::StatusCode::kOk) { @@ -1470,7 +1470,7 @@ class OpConverterTest : public ::testing::Test { void RunValidationAndConversion( const NodeDef& node_def, absl::StatusCode expected_code = absl::StatusCode::kOk, - const std::string& expected_msg_substr = "", + absl::string_view expected_msg_substr = "", bool should_run_conversion = true) { // Add the node to the graph. 
// TODO(laigd): we should accept a function that adds the node using @@ -1505,7 +1505,7 @@ class OpConverterTest : public ::testing::Test { const std::vector>& exp_out_dims) { RunValidationAndConversion(node_def, static_cast(status.code()), - status.error_message(), true); + status.message(), true); if (status.ok()) { // TODO(tfeher): Enable this check in explicit_batch_mode. @@ -9881,6 +9881,48 @@ TEST_P(OpConverter_Select, ConvertSelectV2) { RunTest("SelectV2"); } TEST_P(OpConverter_Select, Convert_Select) { RunTest("Select"); } +TEST_F(OpConverterTest, DuplicateSqueeze) { + // Define a custom converter which performs multiple squeezes. + auto op_converter = [](const OpConverterParams* params) -> Status { + if (params->validation_only) return OkStatus(); + auto input = params->inputs.at(0).tensor(); + ITensorProxyPtr output; + // Squeeze the first dimension. + std::vector new_dims = {0, 1, 2, 3}; + TF_EXPECT_OK(params->converter->SqueezeTensor( + /*input=*/input, /*input_dims=*/&new_dims, /*params=*/params, + /*output=*/&output, /*op_instance=*/0)); + // Squeeze the second dimension. + new_dims = {0, 2, 3}; + TF_EXPECT_OK(params->converter->SqueezeTensor( + /*input=*/output, /*input_dims=*/&new_dims, /*params=*/params, + /*output=*/&output, /*op_instance=*/1)); + params->outputs->push_back(TRT_TensorOrWeights(output)); + return OkStatus(); + }; + // Use a simple unary op for the custom converter and add an input. + NodeDef node_def = CreateUnaryOp(DataType::DT_FLOAT); + AddTestTensor("input", {1, 1, 2, 3}); + // Override the converter for Abs to use the custom converter for this test + // only, and run conversion. + GetOpConverterRegistry()->Register("Abs", kDefaultConverterPriority + 1, + op_converter); + RunValidationAndConversion(node_def); + // Set up the inputs and outputs. + DataVec input_data; + DataVec output_data; + InputOutputData abs_input{ + "input", ConstructTensor(/*data_size=*/6, /*value=*/0, + /*tf_type=*/DataType::DT_FLOAT)}; + InputOutputData abs_output{ + "my_unary", ConstructTensor(/*data_size=*/6, /*value=*/0, + /*tf_type=*/DataType::DT_FLOAT)}; + input_data.push_back(abs_input); + output_data.push_back(abs_output); + // Build and run the cuda engine. 
+ TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); +} + #endif } // namespace convert diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h b/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h index 5445df8b51c..e3aadc279d9 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h @@ -657,8 +657,8 @@ class TRTNetworkBuilder { nvinfer1::INetworkDefinition* Network() { return network_; } private: - nvinfer1::INetworkDefinition* const network_; - TrtWeightStore* const weight_store_; + nvinfer1::INetworkDefinition* network_; + TrtWeightStore* weight_store_; }; class ShuffleBuilder { diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index ef61ea3fce6..f2cc8be2fd0 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -69,6 +69,10 @@ string DebugString(const nvinfer1::DataType trt_dtype) { #if IS_TRT_VERSION_GE(8, 5, 0, 0) case nvinfer1::DataType::kUINT8: return "kUINT8"; +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + return "kFP8"; #endif default: return "Invalid TRT data type"; @@ -204,6 +208,11 @@ Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type) { case nvinfer1::DataType::kUINT8: *tf_type = DT_UINT8; break; +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + *tf_type = DT_FLOAT8_E4M3FN; + break; #endif default: return errors::InvalidArgument("Invalid TRT data type"); diff --git a/tensorflow/compiler/tf2tensorrt/convert/weights.cc b/tensorflow/compiler/tf2tensorrt/convert/weights.cc index c608291a0ae..da2157096b5 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/weights.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/weights.cc @@ -68,6 +68,9 @@ size_t TRT_ShapedWeights::size_bytes() const { break; #if IS_TRT_VERSION_GE(8, 5, 0, 0) case nvinfer1::DataType::kUINT8: +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: #endif case nvinfer1::DataType::kINT8: case nvinfer1::DataType::kBOOL: diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 1e1d7eab557..abf83f27027 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -244,6 +244,9 @@ class TRTEngineOp : public AsyncOpKernel { // Maximum number of cached engines. 
int max_cached_engines_; + // Flag to detect whether native segment nodes have been deleted from graph + bool native_segment_absent_; + int64 workspace_size_; mutex engine_mutex_; FunctionLibraryRuntime::Handle native_execution_func_handle_; @@ -357,7 +360,7 @@ StatusOr TRTEngineOp::ConstructFunctionHandle( FunctionLibraryRuntime::InstantiateOptions inst_ops; inst_ops.state_handle = ""; inst_ops.target = device_name; - if (allow_soft_placement) { + if (!native_segment_absent_ && allow_soft_placement) { const FunctionDef* fdef = lib->GetFunctionLibraryDefinition()->Find(func_.name()); if (!fdef) { @@ -421,9 +424,6 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("calibration_data", &calibration_data)); OP_REQUIRES_OK(context, context->GetAttr("segment_func", &func_)); - OP_REQUIRES(context, !func_.name().empty(), - errors::InvalidArgument( - "The TF function for the TRT segment could not be empty")); OP_REQUIRES_OK(context, TrtPrecisionModeFromName(precision_string, &precision_mode_)); OP_REQUIRES_OK(context, @@ -468,11 +468,17 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) use_explicit_precision_ = false; } + // When a TF-TRT converted model without native segments is loaded, + // func_ can be empty. + native_segment_absent_ = (func_.name() == ""); native_execution_func_handle_ = kInvalidHandle; - if (!static_engine_) { - OP_REQUIRES_OK(context, ImportSegmentGraphDef(context->function_library(), - context->device()->name())); + if (!native_segment_absent_) { + if (!static_engine_) { + OP_REQUIRES_OK(context, ImportSegmentGraphDef(context->function_library(), + context->device()->name())); + } } + // TODO(laigd): calibration_data is used in TF v1.x and we keep it only for // backward compatibility reasons. Remove it once all known users switch to // 2.0. @@ -721,7 +727,12 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, VLOG(2) << "Passed calibration data"; } } - ExecuteNativeSegment(ctx, async_helper); + if (!native_segment_absent_) { + ExecuteNativeSegment(ctx, async_helper); + } else { + LOG(ERROR) << "Calibration requires native segment, but is not found in " + "the graph."; + } } Status TRTEngineOp::VerifyInputShapes( @@ -843,11 +854,11 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, Status verify_input_shape_status = VerifyInputShapes(input_concrete_shapes_filtered); // TODO(bixia): Fix the segmentation. - if (!verify_input_shape_status.ok()) { + if (!verify_input_shape_status.ok() && !native_segment_absent_) { LOG_FIRST_FEW_WARNING_WITH_PREFIX << "Running native segment for" << name() << " due to failure in verifying input shapes: " - << verify_input_shape_status.error_message(); + << verify_input_shape_status.message(); ExecuteNativeSegment(ctx, async_helper); return; } @@ -868,8 +879,14 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, // Just collect the input shape info and return. The shapes are used to // generate optimization profiles during engine creation. 
cache_res->profiles_.AddShape(input_concrete_shapes); - VLOG(1) << "Native segment is used during collecting shapes for profiles"; - ExecuteNativeSegment(ctx, async_helper); + VLOG(1) + << "Native segment is used during collecting shapes for profiles."; + if (!native_segment_absent_) { + ExecuteNativeSegment(ctx, async_helper); + } else { + LOG(ERROR) << "Native segment is required for profile generation, " + "but is not found in the graph."; + } return; } else if (cache_res->profiles_.GetNumProfiles() == 0 && !static_engine_) { // Add current shape if we did not collect any shapes so far. @@ -926,9 +943,14 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, EngineContext* engine_context = status.value().first; int trt_context_idx = status.value().second; auto may_execute_native_segment = [&] { - if (!AllowEngineNativeSegmentExecution()) { + if (!native_segment_absent_ && !AllowEngineNativeSegmentExecution()) { ctx->CtxFailure( - errors::Aborted("User disallowed engine native segment execution")); + errors::Aborted("User disallowed engine native segment execution.")); + return false; + } else if (native_segment_absent_) { + ctx->CtxFailure( + errors::Aborted("Native segment execution is enabled but " + " native segment is not found in the graph.")); return false; } return true; @@ -954,14 +976,20 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, if (!may_execute_native_segment()) { return; } - // Release any outputs that are allocated, ExecuteNativeSegment will - // re-allocate them and fail if they are currently allocated. + // When Native Segment execution is enabled, release any outputs that + // are allocated. ExecuteNativeSegment will re-allocate them and + // fail if they are currently allocated. // The Tensor pointer in the returned TensorValue must be explicitly // deleted. for (int i = 0; i < ctx->num_outputs(); i++) { delete ctx->release_output(i).tensor; } - ExecuteNativeSegment(ctx, async_helper); + if (!native_segment_absent_) { + ExecuteNativeSegment(ctx, async_helper); + } else { + LOG(ERROR) << "Native segment execution is enabled, " + "but native segment is not found in the graph."; + } } Status TRTEngineOp::ExecuteTrtEngine( diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index a9994bc2db3..3e71229888b 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -765,7 +765,7 @@ string GenerateNonConversionReport( // Log the error in case of issue, however do not stop execution. LOG(ERROR) << "Problem encountered while generating the TF-TRT " << "Non-Conversion Report in CSV Format:\n" - << status.error_message(); + << status.message(); } show_detailed_conversion_report = true; } else if (std::stoi(detailed_report_var) >= 1) { @@ -949,7 +949,7 @@ Status SegmentGraph(const Graph* tf_graph, } else { const Status status = candidate_fn(node->tf_node()); if (!status.ok()) { - exclude_node(status.error_message()); + exclude_node(status.message()); } else if (tftrt_op_denylist.contains(node->tf_node()->type_string())) { // WARNING verbosity since the user explicitly requests this behavior. 
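Aside on the error_message() -> message() migrations in the hunks above and below: on current Status types, message() returns an absl::string_view rather than a const std::string&, so the view is not guaranteed to be NUL-terminated. Streaming it into LOG works unchanged, but C-style consumers must pass an explicit data()/size() pair (as the light_outside_compilation.cc hunk later in this patch does). A minimal sketch, not part of the patch, with an illustrative helper name:

#include <cstdio>

#include "absl/status/status.h"
#include "absl/strings/string_view.h"

// Illustrative only: forward a Status message to a C-style sink without
// assuming the view is NUL-terminated.
void LogStatusMessage(const absl::Status& s) {
  absl::string_view msg = s.message();
  std::fprintf(stderr, "%.*s\n", static_cast<int>(msg.size()), msg.data());
}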
LOG_WARNING_WITH_PREFIX diff --git a/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc b/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc index c675e1157e0..171798b216a 100644 --- a/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc +++ b/tensorflow/compiler/tf2tensorrt/trt_convert_api.cc @@ -158,8 +158,8 @@ Status RunTfTrt(const MetaGraphDef& meta_graph_def, const RewriterConfig& rewriter_config, GraphDef* segmented_graph_def) { ConfigProto config_proto; - config_proto.mutable_graph_options()->mutable_rewrite_options()->CopyFrom( - rewriter_config); + *config_proto.mutable_graph_options()->mutable_rewrite_options() = + rewriter_config; VLOG(4) << "Setting up Grappler parameters\n" << config_proto.DebugString(); std::unique_ptr cluster; @@ -202,7 +202,7 @@ Status RunSession(Session* session, const std::vector& input_names, std::vector> input_pairs; std::vector prefixed_output_names; auto prefixed_name = [](std::string prefix, std::string name) { - return prefix.size() > 0 ? absl::StrJoin({prefix, name}, "/") : name; + return !prefix.empty() ? absl::StrJoin({prefix, name}, "/") : name; }; for (int i = 0; i < input_names.size(); i++) { input_pairs.push_back( @@ -315,7 +315,7 @@ Status ReadSerializedEngine( // Saves the TRT engines as attributes of the TRTEngineOp nodes. Status ConvertToStaticEngine(const GraphDef graph_def, GraphDef* static_graph_def, Session* session) { - static_graph_def->CopyFrom(graph_def); + *static_graph_def = graph_def; VLOG(1) << "Saving TRT engines as static engine"; std::string op{"TRTEngineOp"}; for (auto& node : *(static_graph_def->mutable_node())) { @@ -397,7 +397,7 @@ StatusOr ConvertAndBuild( const TfTrtConversionParams& conv_params) { TF_RETURN_IF_ERROR(ValidateConversionParams(conv_params, inputs.size())); MetaGraphDef meta_graph; - meta_graph.mutable_graph_def()->CopyFrom(frozen_graph_def); + *meta_graph.mutable_graph_def() = frozen_graph_def; RewriterConfig rewriter_config; TF_RETURN_IF_ERROR( @@ -409,12 +409,12 @@ StatusOr ConvertAndBuild( GraphDef output; - if (inputs.size() > 0 && conv_params.convert_to_static_engine) { + if (!inputs.empty() && conv_params.convert_to_static_engine) { // The TRTOptimization pass has inserted placeholder TRTEngineOps. Here we // trigger conversion by inferring the graph. std::unique_ptr session( tensorflow::NewSession(GetSessionConfg())); - if (!session.get()) { + if (!session) { return errors::Internal("Failed to create build session"); } @@ -424,7 +424,7 @@ StatusOr ConvertAndBuild( TF_RETURN_IF_ERROR( ConvertToStaticEngine(segmented_graph_def, &output, session.get())); } else { - output.CopyFrom(segmented_graph_def); + output = segmented_graph_def; } VLOG(1) << "TF-TRT conversion finished"; return output; @@ -456,9 +456,9 @@ Status FreezeGraph(SavedModelBundle& bundle, MetaGraphDef* frozen_meta_graph) { TF_RETURN_IF_ERROR( FreezeSavedModel(bundle, &frozen_graph_def, &inputs, &outputs)); - frozen_meta_graph->CopyFrom(bundle.meta_graph_def); + *frozen_meta_graph = bundle.meta_graph_def; GraphDef* gdef = frozen_meta_graph->mutable_graph_def(); - gdef->CopyFrom(frozen_graph_def); + *gdef = frozen_graph_def; VLOG(2) << "Graph frozen"; return OkStatus(); @@ -491,7 +491,7 @@ StatusOr ConvertAndBuild( // Replace the graph_def with the inlined graph. Note that bundle->session // still has the original graph. - bundle->meta_graph_def.mutable_graph_def()->CopyFrom(inlined_graph_def); + *bundle->meta_graph_def.mutable_graph_def() = inlined_graph_def; // Freeze variables. 
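Aside on the CopyFrom -> assignment rewrites in trt_convert_api.cc above: generated protobuf message classes define a copy-assignment operator that performs the same deep copy as CopyFrom, so the change is behavior-preserving; the assignment form simply reads better with mutable_*() accessors. A minimal sketch, not part of the patch:

#include "tensorflow/core/framework/graph.pb.h"

// Illustrative only: both forms perform a deep copy of the message.
void CopyGraph(const tensorflow::GraphDef& src, tensorflow::GraphDef* dst) {
  *dst = src;             // preferred spelling
  // dst->CopyFrom(src);  // equivalent
}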
MetaGraphDef frozen_meta_graph; diff --git a/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc b/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc index 706a8b515e1..5d969614448 100644 --- a/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc +++ b/tensorflow/compiler/tf2tensorrt/trt_convert_api_test.cc @@ -107,16 +107,14 @@ class TrtConverterTest {}); FunctionDef fdef; if (use_variable_) { - gdef.add_node()->CopyFrom( + *gdef.add_node() = NDef("my_var", "VarHandleOp", {}, - {{"dtype", DT_FLOAT}, {"shape", value_shape_proto}})); + {{"dtype", DT_FLOAT}, {"shape", value_shape_proto}}); - gdef.add_node()->CopyFrom(NDef("my_var/init", "AssignVariableOp", - {"my_var", "my_const"}, - {{"dtype", DT_FLOAT}})); - gdef.add_node()->CopyFrom(NDef("my_var/Read/ReadVariableOp", - "ReadVariableOp", {"my_var"}, - {{"dtype", DT_FLOAT}})); + *gdef.add_node() = NDef("my_var/init", "AssignVariableOp", + {"my_var", "my_const"}, {{"dtype", DT_FLOAT}}); + *gdef.add_node() = NDef("my_var/Read/ReadVariableOp", "ReadVariableOp", + {"my_var"}, {{"dtype", DT_FLOAT}}); // Define function f(x, v) = x * v + x, where v is a variable. fdef = FunctionDefHelper::Define( "f", // Name @@ -146,7 +144,7 @@ class TrtConverterTest {{"my_add"}, "AddV2", {"x", "my_mul"}, {{"T", DT_FLOAT}}}, {{"q"}, "Identity", {"my_add"}, {{"T", DT_FLOAT}}}}); } - gdef.mutable_library()->add_function()->CopyFrom(fdef); + *gdef.mutable_library()->add_function() = fdef; return gdef; } @@ -166,13 +164,12 @@ class TrtConverterTest SignatureDef signature_def; (*signature_def.mutable_inputs())["input"].set_name("input:0"); (*signature_def.mutable_inputs())["input"].set_dtype(DT_FLOAT); - (*signature_def.mutable_inputs())["input"].mutable_tensor_shape()->CopyFrom( - shape_proto); + *(*signature_def.mutable_inputs())["input"].mutable_tensor_shape() = + shape_proto; (*signature_def.mutable_outputs())["output"].set_name("output:0"); (*signature_def.mutable_outputs())["output"].set_dtype(DT_FLOAT); - (*signature_def.mutable_outputs())["output"] - .mutable_tensor_shape() - ->CopyFrom(shape_proto); + *(*signature_def.mutable_outputs())["output"].mutable_tensor_shape() = + shape_proto; (*out.mutable_signature_def())["serving_default"] = signature_def; VLOG(2) << signature_def.DebugString(); diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc index 110a32b1f2f..798ebd8bd0c 100755 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc @@ -108,6 +108,10 @@ Status SetupBindings(nvinfer1::ICudaEngine* cuda_engine, const Tensor& tensor, case nvinfer1::DataType::kUINT8: buffers[binding_index] = const_cast(tensor.flat().data()); break; +#endif +#if IS_TRT_VERSION_GE(8, 6, 0, 0) + case nvinfer1::DataType::kFP8: + return errors::Internal("FP8 inputs are not supported yet!"); #endif default: return errors::Internal("Unknown TRT data type: ", diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 8827bd480b1..0bf252386bc 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -35,6 +35,7 @@ package_group( "//tensorflow/compiler/tf2xla/...", "//tensorflow/core/tpu/...", "//tensorflow/python/compiler/...", + "//tensorflow/python/util/...", ], ) @@ -42,6 +43,7 @@ package_group( name = "friends", includes = [":internal"], packages = [ + "//platforms/performance/automl/...", "//tensorflow/...", "//tensorflow_federated/cc/core/impl/executors/...", 
"//tensorflow_models/...", @@ -210,6 +212,7 @@ filegroup( "//tensorflow/compiler/xla:cpu_runtime_hdrs", "//tensorflow/compiler/xla/service:custom_call_status_hdrs", "//tensorflow/compiler/xla/service/cpu:runtime_hdrs", + "//tensorflow/compiler/xla/service/cpu:xla_runtime_runner_hdrs", "//tensorflow/core/kernels:xla_cpu_runtime_hdrs", "//tensorflow/core/platform:xla_cpu_runtime_srcs", "//tensorflow/tsl/framework:xla_cpu_runtime_hdrs", @@ -226,6 +229,7 @@ filegroup( "//tensorflow/compiler/xla:cpu_runtime_srcs", "//tensorflow/compiler/xla/service:custom_call_status_srcs", "//tensorflow/compiler/xla/service/cpu:runtime_srcs", + "//tensorflow/compiler/xla/service/cpu:xla_runtime_runner_srcs", "//tensorflow/core/kernels:xla_cpu_runtime_srcs", "//tensorflow/core/platform:xla_cpu_runtime_srcs", "//tensorflow/tsl/platform:xla_cpu_runtime_srcs", @@ -282,7 +286,9 @@ cc_library( "@com_google_absl//absl/base", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", "//third_party/eigen3", @@ -371,6 +377,7 @@ cc_library( # binary produced by tfcompile. "//tensorflow/compiler/xla:cpu_function_runtime", "//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/xla/service/cpu:buffer_desc", "//tensorflow/core/platform:types", ], ) @@ -403,10 +410,10 @@ cc_library( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:platform_util", - "//tensorflow/core:lib", - "//tensorflow/core/platform:errors", - "//tensorflow/core:protos_all_cc", "//tensorflow/compiler/xla/stream_executor:platform", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", ] + if_libtpu( if_false = [ "//tensorflow/compiler/xla/service:cpu_plugin", @@ -417,14 +424,32 @@ cc_library( ), ) +tf_cc_test( + name = "graph_compiler_test", + srcs = ["graph_compiler_test.cc"], + deps = [ + ":graph_compiler_util", + ":tf2xla_proto_cc", + ":xla_compilation_device", + ":xla_compiler", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/lib/monitoring:cell_reader", + "//tensorflow/core/platform:refcount", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "xla_compiler", srcs = [ "const_analysis.cc", "graph_compiler.cc", "xla_compiler.cc", - "xla_op_kernel.cc", "xla_cpu_backend.cc", + "xla_op_kernel.cc", ] + if_cuda_is_configured([ "xla_gpu_backend.cc", ]) + if_rocm_is_configured([ @@ -442,8 +467,8 @@ cc_library( visibility = [":friends"], deps = [ ":common", - ":layout_util", ":host_compute_metadata_proto_cc", + ":layout_util", ":rearrange_function_argument", ":sharding_util", ":side_effect_util", @@ -455,22 +480,13 @@ cc_library( ":xla_helpers", ":xla_op_registry", ":xla_resource", - "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/types:optional", - "@com_google_absl//absl/types:span", - "@com_google_absl//absl/types:variant", "//tensorflow/compiler/jit:common", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:shape_inference", 
"//tensorflow/compiler/jit:xla_compile_util", + "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy", + "//tensorflow/compiler/mlir/tf2xla/api/v0:compile_mlir_util_no_tf_dialect_passes", "//tensorflow/compiler/mlir/utils:array_container_utils", - "//tensorflow/compiler/xla/translate/mhlo_to_hlo:layout_util", - "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util_no_tf_dialect_passes", - "//tensorflow/compiler/xla/client:value_inference", - "//tensorflow/compiler/xla/service:computation_placer_hdr", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", @@ -479,11 +495,12 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:value_inference", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/hlo/ir:hlo", - "//tensorflow/core/util:overflow", - "//tensorflow/core/tpu:tpu_defs", + "//tensorflow/compiler/xla/service:computation_placer_hdr", + "//tensorflow/compiler/xla/translate/mhlo_to_hlo:layout_util", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -491,6 +508,14 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/tpu:tpu_defs", + "//tensorflow/core/util:overflow", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", + "@com_google_absl//absl/types:variant", ] + if_libtpu([ ":xla_tpu_backend_registration", ]), @@ -1336,8 +1361,9 @@ cc_library( deps = [ ":xla_compiler", "//tensorflow/compiler/jit:xla_compile_util", - "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util_no_tf_dialect_passes", + "//tensorflow/compiler/mlir/tf2xla/api/v0:compile_mlir_util_no_tf_dialect_passes", "//tensorflow/compiler/mlir/utils:array_container_utils", + "//tensorflow/core:framework", "@llvm-project//mlir:IR", ], ) @@ -1380,3 +1406,41 @@ tf_cuda_cc_test( "@com_google_absl//absl/memory", ], ) + +filegroup( + name = "tf2xla_opset_hdrs", + srcs = [ + "tf2xla_opset.h", + ], + visibility = ["//tensorflow/python/util:__pkg__"], +) + +cc_library( + name = "tf2xla_opset", + srcs = [ + "tf2xla_opset.cc", + ], + hdrs = ["tf2xla_opset.h"], + visibility = ["//tensorflow/python:__pkg__"], + deps = [ + ":tf2xla_util", + ":xla_op_registry", + "//tensorflow/compiler/jit:xla_device", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "tf2xla_opset_test", + srcs = [ + "tf2xla_opset_test.cc", + ], + deps = [ + ":tf2xla_opset", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc index fef312ab635..ee91e574a02 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -1528,10 +1528,10 @@ Status FunctionalizeCond::FunctionalizeInternal() { // nesting. (CondId, AncestorId) is not enough, e.g. 
// pred1 = array_ops.placeholder(dtypes.bool, name='pred1') // pred2 = array_ops.placeholder(dtypes.bool, name='pred2') - // cond1 = control_flow_ops.cond(pred1, ...) - // cond2 = control_flow_ops.cond(pred2, ...) - // cond3 = control_flow_ops.cond(pred1, use cond1 and cond2) - // cond4 = control_flow_ops.cond(pred2, use cond1 and cond2) + // cond1 = cond.cond(pred1, ...) + // cond2 = cond.cond(pred2, ...) + // cond3 = cond.cond(pred1, use cond1 and cond2) + // cond4 = cond.cond(pred2, use cond1 and cond2) // cond3 and cond4 have the same (CondId, AncestorId), but they should not // be merged into one "If" node (because they have different predicates). std::deque> merge_clusters; diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 91a9fc63716..b1fd82aeed4 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -65,7 +65,7 @@ Status FindIfThenAndElse(const GraphDef& graph, string* op_name, // Graph: // x = array_ops.placeholder(dtypes.int32) // y = array_ops.placeholder(dtypes.int32) -// z = control_flow_ops.cond( +// z = cond.cond( // math_ops.less(y, x), lambda: math_ops.multiply(y, 17), // lambda: math_ops.add(x, 23)) // diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index 1b07033b5c8..f72a47ace77 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -47,12 +47,20 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/public/version.h" #include "tensorflow/core/util/dump_graph.h" namespace tensorflow { +auto* graph_compiler_failed_compilation_op_count = + tensorflow::monitoring::Counter<1>::New( + /*metric_name=*/ + "/tensorflow/core/tf2xla/graph_compilation_failed_op_count", + /*metric_description=*/"Records an op that failed to compile", + /*metric_label=*/"op_name"); + namespace { Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, const std::vector& expressions, @@ -177,6 +185,9 @@ Status GraphCompiler::Compile() { device_->Compute(CHECK_NOTNULL(params.op_kernel), &op_context); Status s = op_context.status(); if (!s.ok()) { + graph_compiler_failed_compilation_op_count + ->GetCell(params.op_kernel->def().op()) + ->IncrementBy(1); return AttachDef(s, n->def()); } } diff --git a/tensorflow/compiler/tf2xla/graph_compiler_test.cc b/tensorflow/compiler/tf2xla/graph_compiler_test.cc new file mode 100644 index 00000000000..6ec8b8f8793 --- /dev/null +++ b/tensorflow/compiler/tf2xla/graph_compiler_test.cc @@ -0,0 +1,150 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/graph_compiler.h" + +#include + +#include +#include +#include "tensorflow/compiler/tf2xla/graph_compiler_util.h" +#include "tensorflow/compiler/tf2xla/tf2xla.pb.h" +#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/lib/monitoring/cell_reader.h" +#include "tensorflow/core/platform/refcount.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/version.h" + +namespace tensorflow { +namespace { + +using ::tensorflow::monitoring::testing::CellReader; + +constexpr char kOpCompilationFailureStreamz[] = + "/tensorflow/core/tf2xla/graph_compilation_failed_op_count"; + +class DummyOp : public XlaOpKernel { + public: + explicit DummyOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override {} +}; + +REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_DEFAULT), DummyOp); +REGISTER_KERNEL_BUILDER(Name("NoOp").Device("XLA_TPU_JIT"), DummyOp); +REGISTER_KERNEL_BUILDER(Name("NoOp").Device("XLA_CPU_JIT"), DummyOp); + +class MockAlwaysFailsOp : public XlaOpKernel { + public: + explicit MockAlwaysFailsOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + ctx->CtxFailure(__FILE__, __LINE__, errors::InvalidArgument("MockBroken")); + } +}; + +REGISTER_OP("MockAlwaysFails") + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +A test only Op that always fails to compile. +)doc"); + +REGISTER_KERNEL_BUILDER(Name("MockAlwaysFails").Device(DEVICE_DEFAULT), + MockAlwaysFailsOp); +REGISTER_KERNEL_BUILDER(Name("MockAlwaysFails").Device("XLA_CPU_JIT"), + MockAlwaysFailsOp); +REGISTER_KERNEL_BUILDER(Name("MockAlwaysFails").Device("XLA_TPU_JIT"), + MockAlwaysFailsOp); +REGISTER_XLA_OP(Name("MockAlwaysFails").CompilationOnly(), MockAlwaysFailsOp); + +class GraphCompilerTest : public ::testing::Test { + public: + void SetUp() override { + device_ = new tensorflow::XlaCompilationDevice( + tensorflow::SessionOptions(), tensorflow::DeviceType("XLA_TPU_JIT")); + device_mgr_ = std::make_unique(absl::WrapUnique(device_)); + } + + Status RunGraphCompiler(Graph& graph) { + ProcessFunctionLibraryRuntime runtime( + device_mgr_.get(), Env::Default(), nullptr, TF_GRAPH_DEF_VERSION, + &graph.flib_def(), OptimizerOptions()); + + xla::XlaBuilder builder("test_builder"); + XlaCompiler::Options options; + options.device_type = "XLA_TPU_JIT"; + + XlaCompiler xla_compiler(options); + + // Resource cleanup is messy, see the LINT.ThenChange for comments. 
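Aside on the failed-compilation metric added in graph_compiler.cc above: it follows the usual tfstreamz pattern of a process-wide Counter<1> keyed by op name, incremented on failure, and observed in tests through monitoring::testing::CellReader, as this new test does below. A condensed sketch of that pattern, not part of the patch, with an illustrative metric name:

#include <string>

#include "tensorflow/core/lib/monitoring/cell_reader.h"
#include "tensorflow/core/lib/monitoring/counter.h"

// Process-wide counter with one string label ("op_name").
auto* failed_op_count = tensorflow::monitoring::Counter<1>::New(
    "/test/failed_op_count", "Ops that failed to compile.", "op_name");

void RecordFailure(const std::string& op_name) {
  failed_op_count->GetCell(op_name)->IncrementBy(1);
}

// In a test, the delta since the reader was constructed can then be asserted:
//   tensorflow::monitoring::testing::CellReader<int64_t> reader(
//       "/test/failed_op_count");
//   RecordFailure("MockAlwaysFails");
//   EXPECT_EQ(reader.Delta("MockAlwaysFails"), 1);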
+ // LINT.IfChange + XlaContext* xla_context = new XlaContext(&xla_compiler, &builder, &graph); + core::ScopedUnref context_unref(xla_context); + xla_context->Ref(); + + auto step_container = + std::make_unique(0, [this](const string& name) { + Status status = this->device_->resource_manager()->Cleanup(name); + }); + auto container_status = step_container->Create( + device_->resource_manager(), XlaContext::kXlaContextResourceName, + xla_context); + + GraphCompiler graph_compiler( + device_, &graph, runtime.GetFLR(device_->name()), step_container.get()); + + return graph_compiler.Compile(); + // LINT.ThenChange(//tensorflow/compiler/tf2xla/xla_compiler.cc:ExecuteGraph) + } + + protected: + XlaCompilationDevice* device_; // Owned by device_mgr_ + std::unique_ptr device_mgr_; +}; + +TEST_F(GraphCompilerTest, CompilesGraph) { + Graph graph(OpRegistry::Global()); + + EXPECT_TRUE(RunGraphCompiler(graph).ok()); +} + +TEST_F(GraphCompilerTest, RecordsStreamzFailedCompilationNode) { + Graph graph(OpRegistry::Global()); + Node* mock_fail; + ASSERT_TRUE(NodeBuilder("mock_fail", "MockAlwaysFails") + .Finalize(&graph, &mock_fail) + .ok()); + graph.AddControlEdge(graph.source_node(), mock_fail); + graph.AddControlEdge(mock_fail, graph.sink_node()); + + CellReader op_reader(kOpCompilationFailureStreamz); + + EXPECT_FALSE(RunGraphCompiler(graph).ok()); + + EXPECT_EQ(op_reader.Delta("MockAlwaysFails"), 1); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index cb4ed43287b..ac616e542a5 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -172,28 +172,15 @@ tf_kernel_library( ":case_op", ":conv_op_helpers", ":if_op", + ":rng_converter_utils", ":tensor_list_utils", ":while_op", ":xla_call_module_op", - ":rng_converter_utils", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:optional", - "@com_google_absl//absl/types:span", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - "@stablehlo//:chlo_ops", "//tensorflow/compiler/jit:xla_activity_listener", "//tensorflow/compiler/jit:xla_activity_proto_cc", - "//tensorflow/compiler/xla/mlir_hlo", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/xla/translate/hlo_to_mhlo:hlo_utils", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:mlir_xla_op_kernel", "//tensorflow/compiler/tf2xla:xla_compilation_device", @@ -238,6 +225,8 @@ tf_kernel_library( "//tensorflow/compiler/xla/client/lib:sorting", "//tensorflow/compiler/xla/client/lib:svd", "//tensorflow/compiler/xla/client/lib:tridiagonal", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/compiler/xla/translate/hlo_to_mhlo:hlo_utils", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -247,6 +236,17 @@ tf_kernel_library( "//tensorflow/core/kernels:stateless_random_ops_v2_header", "//tensorflow/core/tpu:tpu_defs", "//tensorflow/core/util:overflow", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + 
"@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@stablehlo//:chlo_ops", ] + if_cuda_or_rocm( if_false = [], if_true = [":light_outside_compilation"], @@ -377,13 +377,38 @@ cc_library( ], ) +cc_library( + name = "xla_call_module_loader", + srcs = ["xla_call_module_loader.cc"], + hdrs = ["xla_call_module_loader.h"], + deps = [ + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/compiler/xla/pjrt:mlir_to_hlo", + "//tensorflow/compiler/xla/translate/hlo_to_mhlo:hlo_utils", + "//tensorflow/tsl/platform:errors", + "//tensorflow/tsl/platform:regexp", + "//tensorflow/tsl/platform:statusor", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + "@stablehlo//:chlo_ops", + "@stablehlo//:stablehlo_ops", + "@stablehlo//:stablehlo_passes", + "@stablehlo//:stablehlo_serialization", + "@stablehlo//:vhlo_ops", + ], +) + tf_kernel_library( name = "xla_call_module_op", srcs = ["xla_call_module_op.cc"], deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:error_util", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + ":xla_call_module_loader", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:side_effect_util", "//tensorflow/compiler/tf2xla:tf2xla_util", @@ -397,27 +422,14 @@ tf_kernel_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/mlir_hlo", - "//tensorflow/compiler/xla/pjrt:mlir_to_hlo", "//tensorflow/compiler/xla/service:hlo_proto_cc", "//tensorflow/compiler/xla/translate/hlo_to_mhlo:hlo_utils", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:status", "//tensorflow/core/tpu:tpu_defs", - "//tensorflow/tsl/platform:regexp", "@com_google_absl//absl/strings", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", - "@stablehlo//:chlo_ops", - "@stablehlo//:stablehlo_ops", - "@stablehlo//:stablehlo_passes", - "@stablehlo//:stablehlo_serialization", - "@stablehlo//:vhlo_ops", + "@llvm-project//llvm:Support", ], ) diff --git a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc index 1707fd10a74..e2b3e3ffcf5 100644 --- a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc @@ -35,13 +35,13 @@ class DataFormatDimMapOp : public XlaOpKernel { OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format)); string dst_format; OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format)); - OP_REQUIRES(context, src_format.size() == 4 or src_format.size() == 5, + OP_REQUIRES(context, src_format.size() == 4 || src_format.size() == 5, errors::InvalidArgument( absl::StrCat("Source format must of length 4 or 5, " "received src_format = ", src_format))); OP_REQUIRES( - context, dst_format.size() == 4 or dst_format.size() == 5, + context, dst_format.size() == 4 || dst_format.size() == 5, errors::InvalidArgument(absl::StrCat( 
"Destination format must of length 4 or 5, received dst_format = ", dst_format))); diff --git a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc index f169d86e8b1..aaf6a8f89eb 100644 --- a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc +++ b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc @@ -521,8 +521,8 @@ void GenericTfCallback(void* stream_handle, void** buffers, const char* opaque, int opaque_len, XlaCustomCallStatus* status) { Status s = CallTfKernel(stream_handle, buffers, opaque, opaque_len); if (!s.ok()) { - XlaCustomCallStatusSetFailure(status, s.error_message().c_str(), - s.error_message().size()); + auto msg = s.message(); + XlaCustomCallStatusSetFailure(status, msg.data(), msg.size()); } } diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc index 2b3ae968309..ad0366eba03 100644 --- a/tensorflow/compiler/tf2xla/kernels/topk_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/sorting.h" @@ -28,12 +29,17 @@ class TopKOp : public XlaOpKernel { public: explicit TopKOp(OpKernelConstruction* context) : XlaOpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("sorted", &sorted_)); + DataType index_type; + OP_REQUIRES_OK(context, context->GetAttr("index_type", &index_type)); + OP_REQUIRES_OK(context, DataTypeToPrimitiveType(index_type, &index_type_)); } void Compile(XlaOpKernelContext* context) override { - const TensorShape input_shape = context->InputShape(0); - int last_dim = input_shape.dims() - 1; - int last_dim_size = input_shape.dim_size(last_dim); + const StatusOr input_shape_or = context->InputXlaShape(0); + OP_REQUIRES_OK(context, input_shape_or.status()); + const xla::Shape& input_shape = *input_shape_or; + int last_dim = input_shape.dimensions_size() - 1; + int last_dim_size = input_shape.dimensions(last_dim); int64_t k; bool k_bound_inferrable = @@ -49,7 +55,7 @@ class TopKOp : public XlaOpKernel { OP_REQUIRES(context, k >= 0, errors::InvalidArgument("Need k >= 0, got ", k)); - OP_REQUIRES(context, input_shape.dims() >= 1, + OP_REQUIRES(context, input_shape.dimensions_size() >= 1, errors::InvalidArgument("input must be >= 1-D, got shape ", input_shape.DebugString())); @@ -64,7 +70,7 @@ class TopKOp : public XlaOpKernel { bool k_is_dynamic; OP_REQUIRES_OK(context, context->ResolveInputDynamismIntoPred(1, &k_is_dynamic)); - xla::XlaOp output_tuple = TopK(context->Input(0), k); + xla::XlaOp output_tuple = TopK(context->Input(0), k, index_type_); auto values = xla::GetTupleElement(output_tuple, 0); auto indices = xla::GetTupleElement(output_tuple, 1); if (k_is_dynamic) { @@ -78,11 +84,18 @@ class TopKOp : public XlaOpKernel { private: bool sorted_; + xla::PrimitiveType index_type_; }; -REGISTER_XLA_OP(Name("TopKV2").CompileTimeConstantInput("k").TypeConstraint( - "T", {DT_UINT32, DT_INT32, DT_UINT64, DT_INT64, DT_FLOAT, - DT_HALF, DT_DOUBLE, DT_BFLOAT16, DT_UINT8, DT_INT8}), +REGISTER_XLA_OP(Name("TopKV2") + .CompileTimeConstantInput("k") + .TypeConstraint("T", + {DT_UINT32, DT_INT32, DT_UINT64, 
DT_INT64, + DT_FLOAT, DT_HALF, DT_DOUBLE, DT_BFLOAT16, + DT_UINT8, DT_INT8, DT_INT16}) + .TypeConstraint("Tk", {DT_INT16, DT_INT32, DT_INT64}) + .TypeConstraint("index_type", + {DT_INT16, DT_INT32, DT_INT64}), TopKOp); } // namespace diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc new file mode 100644 index 00000000000..c8a82fbfa28 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc @@ -0,0 +1,473 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h" + +#include +#include +#include +#include + +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinDialect.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo +#include "stablehlo/dialect/Serialization.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo +#include "stablehlo/transforms/Passes.h" // from @stablehlo +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/xla/pjrt/mlir_to_hlo.h" +#include "tensorflow/compiler/xla/translate/hlo_to_mhlo/hlo_utils.h" +#include "tensorflow/tsl/platform/errors.h" +#include "tensorflow/tsl/platform/regexp.h" +#include "tensorflow/tsl/platform/statusor.h" + +namespace tensorflow { + +namespace { + +// When adding a new version, write when it was added. Also change the default +// version in the constructor in xla.py. +// Version 1 used MHLO & CHLO, not supported anymore. +// Version 2 supports StableHLO & CHLO. From 10/2022. +const int VERSION_START_STABLE_HLO = 2; +// Version 3 supports platform checking and multiple platforms. From 02/2023. 
+const int VERSION_START_PLATFORMS = 3; +// Version 4 supports StableHLO with compatibility guarantees. +// Used from 03/2023. +const int VERSION_START_STABLE_HLO_COMPATIBILITY = 4; +// Version 5 add support to stablehlo.custom_call for host call tf graph. +// Used from 04/2023. +const int VERSION_SUPPORT_CUSTOM_CALL = 5; +const int VERSION_MINIMUM_SUPPORTED = VERSION_START_STABLE_HLO; +const int VERSION_MAXIMUM_SUPPORTED = VERSION_SUPPORT_CUSTOM_CALL; + +// Computes a dimension value from the dim_arg specification. +// The specification is of the form ".". +tsl::StatusOr ComputeDimensionValue( + int version, std::string dim_arg_spec, std::vector arguments, + mlir::OpBuilder op_builder, mlir::Type dim_arg_type) { + static const LazyRE2 dim_arg_spec_re = {R"((\d+).(\d+))"}; + int arg_idx, arg_axis_idx; + if (!RE2::FullMatch(dim_arg_spec, *dim_arg_spec_re, &arg_idx, + &arg_axis_idx)) { + return tsl::errors::InvalidArgument("Syntax error in dim_args_spec '", + dim_arg_spec, "'"); + } + if (arg_idx < 0 || arg_idx >= arguments.size()) { + return tsl::errors::InvalidArgument( + "Invalid argument index ", arg_idx, + " when the number of non-dimension arguments is ", arguments.size(), + " in dim_arg_spec '", dim_arg_spec, "'"); + } + mlir::RankedTensorType arg_type = + arguments[arg_idx].getType().dyn_cast(); + if (!arg_type) { + return tsl::errors::InvalidArgument( + "Argument ", arg_idx, " referenced in dim_arg_spec '", dim_arg_spec, + "' does not have a RankedTensorType"); + } + if (arg_axis_idx < 0 || arg_axis_idx >= arg_type.getShape().size()) { + return tsl::errors::InvalidArgument( + "Invalid axis index ", arg_axis_idx, + " when the rank of non-dimension argument ", arg_idx, " is ", + arg_type.getShape().size(), " in dim_arg_spec '", dim_arg_spec, "'"); + } + mlir::Value val; + mlir::Type get_dim_type = + mlir::RankedTensorType::get({}, op_builder.getI32Type()); + val = op_builder.create( + arguments[arg_idx].getLoc(), get_dim_type, arguments[arg_idx], + op_builder.getI64IntegerAttr(arg_axis_idx)); + if (dim_arg_type != get_dim_type) { + val = op_builder.create( + arguments[arg_idx].getLoc(), dim_arg_type, val); + } + return val; +} + +} // namespace + +tsl::StatusOr> XlaCallModuleLoader::Create( + mlir::MLIRContext *context, int version, std::string module_str, + std::vector dim_args_spec, int platform_index) { + if (version < VERSION_MINIMUM_SUPPORTED) { + return tsl::errors::InvalidArgument( + "XlaCallModuleOp with version ", version, + " is not supported anymore. Must be >= ", VERSION_MINIMUM_SUPPORTED); + } + if (version > VERSION_MAXIMUM_SUPPORTED) { + return tsl::errors::InvalidArgument( + "XlaCallModuleOp with version ", version, + " is not supported by this build. Must be <= ", + VERSION_MAXIMUM_SUPPORTED); + } + + if (version < VERSION_START_PLATFORMS) { + platform_index = -1; + } + + std::unique_ptr loader(new XlaCallModuleLoader); + TF_RETURN_IF_ERROR(loader->LoadAndPreprocessModule( + context, version, std::move(module_str), std::move(dim_args_spec), + platform_index)); + return loader; +} + +// Adds a wrapper for the "main" function to compute the platform index and the +// dimension arguments. +// +// The input module has the following structure: +// +// func public main(%arg_platform_index: i32, %arg_dim0: i32, %arg_dim1: i32, +// %arg0: f32[?, ?, 8]) { ... 
} +// +// where %arg_platform_index is the index of the current compilation platform +// among the declared `platforms` (missing if version < 3 or if platforms has +// fewer than 2 elements), %arg_dim0 and %arg_dim1 are dimension arguments +// (missing if dim_args_spec is empty). The value of the dimension arguments +// are computed based on the static shapes of the actual arguments +// (%arg0 and following). +// In the above example, the dim_args_spec array would have two elements, one +// for %arg_dim0 and one for %arg_dim1. E.g., ['0.0', '0.1'] specifies that +// %arg_dim0 should be set to the size of axis 0 or array argument 0 (%arg0), +// while %arg_dim1 should be set to the size of axis 1. +// The platform index argument must be a 0-dimensional 32-bit integer, and the +// dimension arguments must be 0-dimensional tensors of integer type. +// +// We create a new "main" function as follows: +// func public main(%arg0: f32[?, ?, 8]) { +// %arg_platform_index = stablehlo.constant +// %arg_dim0 = stablehlo.get_dimension_size(%arg0) dimension=0 +// %arg_dim1 = stablehlo.get_dimension_size(%arg0) dimension=1 +// %res = func.call _wrapped_main(%arg_platform_index, +// %arg_dim0, %arg_dim1, %arg0) +// return %res +// } +// func private _wrapped_main(%arg_platform_index: i32, +// %arg_dim0: i32, %arg_dim1: i32, +// %arg0: f32[?, ?, 8]) { +// ... the original main function ... +// } +// +// and then we run the inliner. This is important because in the +// RefineDynamicShapes method called in Compile we refine the shape of the +// array arguments. This would create a type error at the call to _wrapped_main +// with the expected type of %arg0. +tsl::Status XlaCallModuleLoader::AddMainWrapper() { + int nr_dim_args = dim_args_spec_.size(); + // Locate the 'main' function. + // This is the convention used by MlirToXlaComputation. 
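Aside: the lookup used throughout this loader is the templated mlir::ModuleOp::lookupSymbol, which returns a null op when the symbol is missing or is not of the requested kind. A minimal standalone sketch of the pattern, not part of the patch:

#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/IR/BuiltinOps.h"            // from @llvm-project

// Returns the public entry point, or a null FuncOp if "main" is absent.
mlir::func::FuncOp FindMain(mlir::ModuleOp module) {
  return module.lookupSymbol<mlir::func::FuncOp>("main");
}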
+ mlir::func::FuncOp orig_main = + module_->lookupSymbol("main"); + if (!orig_main) { + return tsl::errors::InvalidArgument("Cannot find 'main' in module"); + } + int nr_platform_args = 0; + if (platform_index_ >= 0) { + nr_platform_args = 1; + } + if (orig_main.getNumArguments() <= nr_platform_args + nr_dim_args) { + return tsl::errors::InvalidArgument( + "The module should have ", nr_platform_args, + " platform index arguments and ", nr_dim_args, + " dimension arguments, but it ", "has only ", + orig_main.getNumArguments(), " total arguments"); + } + mlir::Block &orig_main_body = orig_main.front(); + + mlir::SymbolTable::setSymbolVisibility( + orig_main, mlir::SymbolTable::Visibility::Private); + mlir::OpBuilder op_builder(module_->getBodyRegion()); + orig_main.setName(op_builder.getStringAttr("_wrapped_main")); + mlir::Location loc = module_->getLoc(); + std::vector new_main_arg_types( + orig_main.getArgumentTypes().begin() + nr_platform_args + nr_dim_args, + orig_main.getArgumentTypes().end()); + mlir::func::FuncOp new_main = op_builder.create( + loc, "main", + mlir::FunctionType::get(module_->getContext(), + /*inputs=*/new_main_arg_types, + /*results=*/orig_main.getResultTypes())); + mlir::SymbolTable::setSymbolVisibility(new_main, + mlir::SymbolTable::Visibility::Public); + mlir::Block *new_main_block = new_main.addEntryBlock(); + std::vector block_args(new_main_block->getArguments().begin(), + new_main_block->getArguments().end()); + op_builder.setInsertionPointToStart(new_main_block); + + std::vector call_args(orig_main_body.getNumArguments()); + for (int i = 0; i < orig_main_body.getNumArguments(); ++i) { + if (i < nr_platform_args + nr_dim_args) { + mlir::Type arg_type = orig_main.getArgument(i).getType(); + mlir::RankedTensorType arg_ranked_type = + arg_type.dyn_cast(); + if (!arg_ranked_type || + !arg_ranked_type.getElementType().dyn_cast() || + !arg_ranked_type.getShape().empty()) { + std::string argument_type = + (i < nr_platform_args) ? "platform index" : "dimension"; + return tsl::errors::InvalidArgument( + "Module argument at index ", i, + " should be a 0-dimensional integer-tensor ", argument_type, + " argument but has type ", mlir::debugString(arg_type)); + } + if (i < nr_platform_args) { + if (arg_ranked_type.getElementTypeBitWidth() != 32) { + return tsl::errors::InvalidArgument( + "Module argument at index ", i, + " should be a 0-dimensional 32-bit integer-tensor" + " platform index argument but has type ", + mlir::debugString(arg_type)); + } + call_args[i] = op_builder.create( + block_args[0].getLoc(), + op_builder.getI32IntegerAttr(platform_index_)); + } else { + TF_ASSIGN_OR_RETURN( + call_args[i], + ComputeDimensionValue( + version_, dim_args_spec_[i - nr_platform_args], block_args, + op_builder, orig_main.getArgument(i).getType())); + } + } else { + call_args[i] = + new_main_block->getArgument(i - nr_platform_args - nr_dim_args); + } + } + mlir::func::CallOp call_op = op_builder.create( + loc, orig_main.getResultTypes(), orig_main.getSymName(), call_args); + op_builder.create(loc, call_op.getResults()); + VLOG(3) << "XlaCallModule module with wrapper: " + << mlir::debugString(*module_); + + return tsl::OkStatus(); +} + +tsl::Status XlaCallModuleLoader::RefineDynamicShapes( + llvm::ArrayRef input_shapes) { + // Locate the (wrapped) 'main' function. + // This is the convention used by MlirToXlaComputation. + mlir::Block &main_body = main_.front(); + int nr_platform_args = (platform_index_ >= 0 ? 
1 : 0); + int nr_dim_args = dim_args_spec_.size(); + int non_dimension_arguments = input_shapes.size(); + if (non_dimension_arguments != main_body.getNumArguments()) { + return tsl::errors::InvalidArgument( + "Incorrect number of arguments passed to XlaCallModule: ", + non_dimension_arguments, ". The module takes ", + main_body.getNumArguments() + nr_platform_args + nr_dim_args, + " arguments of which ", nr_platform_args, + " platform index arguments and ", nr_dim_args, + " dimension arguments. It must be called with ", + main_body.getNumArguments(), " arguments."); + } + + mlir::Builder builder(module_->getContext()); + std::vector static_array_input_types(non_dimension_arguments); + for (int i = 0, end = non_dimension_arguments; i < end; ++i) { + const xla::Shape &xla_shape = input_shapes[i]; + std::vector xla_dimensions(xla_shape.dimensions().begin(), + xla_shape.dimensions().end()); + TF_ASSIGN_OR_RETURN( + mlir::Type element_type, + ConvertPrimitiveTypeToMLIRType(xla_shape.element_type(), builder)); + mlir::Type type = mlir::RankedTensorType::get(xla_dimensions, element_type); + // TODO(burmako): This fails with an obscure compilation error. + // TF_ASSIGN_OR_RETURN( + // mlir::Type type, + // ConvertShapeToType(xla_shape, builder)); + VLOG(3) << "XlaCallModule static array input type #" << i << ": " + << mlir::debugString(type); + // TODO(b/278273480): Determine whether it's safe to override the element + // type using that from the input shape. + static_array_input_types[i] = type; + } + + // Refine 'main' argument types to use static input types instead. + // This will only change the argument types and will not propagate the + // additional type information further. For that, we'll need to run + // shape refinement as explained below. + // Before refining the argument types it is useful to run the inliner to + // remove calls that may be called with the input arguments. + mlir::PassManager pm_inline(module_->getContext()); + pm_inline.addPass(mlir::createInlinerPass()); + if (!mlir::succeeded(pm_inline.run(*module_))) { + return tsl::errors::InvalidArgument("Module inlining failed"); + } + VLOG(3) << "XlaCallModule module after inlining: " + << mlir::debugString(*module_); + + auto static_array_output_types = llvm::to_vector(main_.getResultTypes()); + for (auto i = 0; i < main_body.getNumArguments(); ++i) { + auto arg = main_body.getArgument(i); + arg.setType(static_array_input_types[i]); + // If the argument is used by `func.return`, then we also need to + // update function result types. It's not great that we need this hack, + // but in the future when we have stablehlo.func, stablehlo.return, etc, + // this will not be needed. + // TODO(burmako): Once https://github.com/openxla/stablehlo/issues/425 is + // fixed, clean this up. + for (mlir::OpOperand &use : arg.getUses()) { + if (auto ret = llvm::dyn_cast(use.getOwner())) { + static_array_output_types[use.getOperandNumber()] = arg.getType(); + } + } + } + main_.setType(builder.getFunctionType(static_array_input_types, + static_array_output_types)); + + // Verify the module before running passes on it. + // If the module doesn't pass verification, all sorts of weirdness might + // happen if we run the pass manager. 
+  if (failed(verify(*module_))) {
+    VLOG(3) << "XlaCallModule module with verification failed: "
+            << mlir::debugString(*module_);
+    return tsl::errors::InvalidArgument("Module verification failed");
+  }
+  mlir::PassManager pm(module_->getContext());
+  if (VLOG_IS_ON(3)) {
+    auto print_before = [](mlir::Pass *, mlir::Operation *) { return true; };
+    auto print_after = [](mlir::Pass *, mlir::Operation *) { return true; };
+    pm.enableIRPrinting(print_before, print_after, /*printModuleScope=*/true,
+                        /*printAfterOnlyOnChange=*/false);
+  }
+  pm.addPass(mlir::createCSEPass());
+  pm.addPass(mlir::stablehlo::createStablehloRefineShapesPass());
+  pm.addNestedPass<mlir::func::FuncOp>(
+      mlir::stablehlo::createStablehloCanonicalizeDynamismPass());
+  if (!mlir::succeeded(pm.run(*module_))) {
+    return tsl::errors::InvalidArgument("Module shape refinement failed");
+  }
+
+  VLOG(3) << "XlaCallModule module with refined shapes: "
+          << mlir::debugString(*module_);
+  return tsl::OkStatus();
+}
+
+tsl::Status XlaCallModuleLoader::LoadAndPreprocessModule(
+    mlir::MLIRContext *context, int version, std::string module_str,
+    std::vector<std::string> dim_args_spec, int platform_index) {
+  context_ = context;
+  version_ = version;
+  dim_args_spec_ = std::move(dim_args_spec);
+  platform_index_ = platform_index;
+
+  // Load a superset of dialects; we should check at serialization time that
+  // we only include allowable dialects.
+  context_->loadDialect<mlir::func::FuncDialect>();
+  context_->loadDialect<mlir::stablehlo::StablehloDialect>();
+  context_->loadDialect<mlir::mhlo::MhloDialect>();
+  context_->loadDialect<mlir::chlo::ChloDialect>();
+  context_->loadDialect<mlir::vhlo::VhloDialect>();
+  // Parses both IR text and bytecode.
+  if (version >= VERSION_START_STABLE_HLO_COMPATIBILITY) {
+    module_ =
+        mlir::stablehlo::deserializePortableArtifact(module_str, context_);
+  } else {
+    module_ = mlir::parseSourceString<mlir::ModuleOp>(module_str, context_);
+  }
+
+  if (!module_) {
+    return tsl::errors::InvalidArgument("Cannot deserialize computation");
+  }
+  VLOG(3) << "Parsed serialized module (version " << version
+          << ", platform_index = " << platform_index_ << ", dim_args_spec = ["
+          << absl::StrJoin(dim_args_spec_, ", ") << "])\n"
+          << mlir::debugString(*module_);
+
+  if (failed(module_->verifyInvariants())) {
+    VLOG(1) << "MLIR verification failed.";
+    module_->dump();
+    return tsl::errors::InvalidArgument("Error verifying module");
+  }
+  main_ = module_->lookupSymbol<mlir::func::FuncOp>("main");
+  if (!main_) {
+    return tsl::errors::InvalidArgument("Cannot find 'main' in module");
+  }
+
+  if (!dim_args_spec_.empty() || platform_index_ >= 0) {
+    TF_RETURN_IF_ERROR(AddMainWrapper());
+    main_ = module_->lookupSymbol<mlir::func::FuncOp>("main");
+  }
+  return tsl::OkStatus();
+}
+
+tsl::Status XlaCallModuleLoader::ValidateModule() {
+  bool moduleHasUnsupportedDialects = false;
+  bool moduleHasDynamicShapes = false;
+
+  module_->walk([&](mlir::Operation *op) {
+    // StableHLO programs created by jax2tf only contain operations
+    // from Builtin, Func and StableHLO dialects.
+    if (!llvm::isa<mlir::BuiltinDialect, mlir::func::FuncDialect,
+                   mlir::stablehlo::StablehloDialect>(op->getDialect())) {
+      moduleHasUnsupportedDialects = true;
+      VLOG(3) << "Operation has unsupported dialects: "
+              << mlir::debugString(*op);
+    }
+
+    // It's sufficient to only check results because operands either come from
+    // results or from block arguments which are checked below.
+    auto hasDynamicShape = [](mlir::Value value) {
+      auto shaped_type = value.getType().dyn_cast<mlir::ShapedType>();
+      return shaped_type ? !shaped_type.hasStaticShape() : false;
+    };
+    bool opHasDynamicShapes = false;
+    opHasDynamicShapes |= llvm::any_of(op->getResults(), hasDynamicShape);
+    for (mlir::Region &region : op->getRegions()) {
+      opHasDynamicShapes |=
+          llvm::any_of(region.getArguments(), hasDynamicShape);
+    }
+    if (opHasDynamicShapes) {
+      moduleHasDynamicShapes = true;
+      VLOG(3) << "Operation has dynamic shapes: " << mlir::debugString(*op);
+    }
+  });
+
+  if (moduleHasUnsupportedDialects)
+    return tsl::errors::InvalidArgument("Module has unsupported dialects");
+  if (moduleHasDynamicShapes)
+    return tsl::errors::InvalidArgument("Module has dynamic shapes");
+  return tsl::OkStatus();
+}
+
+tsl::StatusOr<xla::XlaComputation> XlaCallModuleLoader::ToXlaComputation() {
+  xla::XlaComputation xla_computation;
+  TF_RETURN_IF_ERROR(
+      MlirToXlaComputation(*module_, xla_computation, false, false));
+  return xla_computation;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h
new file mode 100644
index 00000000000..6196cfe1f20
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h
@@ -0,0 +1,85 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_XLA_CALL_MODULE_LOADER_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_XLA_CALL_MODULE_LOADER_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "mlir/IR/TypeRange.h"  // from @llvm-project
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/tsl/platform/statusor.h"
+
+namespace tensorflow {
+
+class XlaCallModuleLoader {
+ public:
+  static tsl::StatusOr<std::unique_ptr<XlaCallModuleLoader>> Create(
+      mlir::MLIRContext* context, int version, std::string module_str,
+      std::vector<std::string> dim_args_spec, int platform_index);
+
+  int nr_outputs() { return main_.getNumResults(); }
+  mlir::TypeRange output_types() { return main_.getResultTypes(); }
+
+  // Refines the dynamic module arguments based on the static argument shapes.
+  // This assumes that the module has a "main" function without dimension args,
+  // but possibly with dynamic shapes. We read the static shapes of the inputs,
+  // then set them as the types of the function parameters, and run StableHLO
+  // shape refinement to specialize all dynamic shapes in the StableHLO program
+  // to static shapes.
+  //
+  // This method accepts a list of shapes as `llvm::ArrayRef<xla::Shape>`
+  // instead of `mlir::Type`. This is to prevent callers from accidentally
+  // passing `mlir::Type` owned by a context that's different from the one
+  // passed to `Create`, which could cause lifetime issues.
+  tsl::Status RefineDynamicShapes(llvm::ArrayRef<xla::Shape> input_shapes);
+
+  // Validate that the module represents a statically-shaped StableHLO program,
+  // otherwise all sorts of weirdness might happen in the HLO exporter which is
+  // much easier to detect here.
+  tsl::Status ValidateModule();
+
+  tsl::StatusOr<xla::XlaComputation> ToXlaComputation();
+
+ private:
+  XlaCallModuleLoader() = default;
+
+  // Initializes the loader with the given serialized module string.
+  tsl::Status LoadAndPreprocessModule(mlir::MLIRContext* context, int version,
+                                      std::string module_str,
+                                      std::vector<std::string> dim_args_spec,
+                                      int platform_index);
+
+  // Adds a wrapper for the "main" function to compute the platform index and
+  // the dimension arguments.
+  tsl::Status AddMainWrapper();
+
+  mlir::MLIRContext* context_;
+  int version_;
+  mlir::OwningOpRef<mlir::ModuleOp> module_;
+  int platform_index_;
+  std::vector<std::string> dim_args_spec_;
+  mlir::func::FuncOp main_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_XLA_CALL_MODULE_LOADER_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc
index 6b30c17a9b0..fbb853528fc 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc
@@ -14,441 +14,40 @@ limitations under the License.
 ==============================================================================*/
 
 #include
+#include
 #include
+#include
 #include
 
 #include "absl/strings/str_join.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
-#include "mlir/IR/BuiltinDialect.h"  // from @llvm-project
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
-#include "mlir/IR/Location.h"  // from @llvm-project
-#include "mlir/IR/MLIRContext.h"  // from @llvm-project
-#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
-#include "mlir/IR/SymbolTable.h"  // from @llvm-project
-#include "mlir/IR/Verifier.h"  // from @llvm-project
-#include "mlir/Parser/Parser.h"  // from @llvm-project
-#include "mlir/Pass/PassManager.h"  // from @llvm-project
-#include "mlir/Support/DebugStringHelper.h"  // from @llvm-project
-#include "mlir/Transforms/Passes.h"  // from @llvm-project
-#include "stablehlo/dialect/ChloOps.h"  // from @stablehlo
-#include "stablehlo/dialect/Serialization.h"  // from @stablehlo
-#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
-#include "stablehlo/dialect/VhloOps.h"  // from @stablehlo
-#include "stablehlo/transforms/Passes.h"  // from @stablehlo
+#include "llvm/ADT/ArrayRef.h"
+#include "tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h"
-#include "tensorflow/compiler/xla/pjrt/mlir_to_hlo.h"
-#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/translate/hlo_to_mhlo/hlo_utils.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
-#include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/tpu/tpu_defs.h"
-#include "tensorflow/tsl/platform/regexp.h"
 
 namespace tensorflow {
 namespace {
 
-// Version 1 used MHLO & CHLO, not supported anymore.
-// Version 2 supports StableHLO & CHLO. From 10/2022. Minimum from 03/2023. -const int VERSION_START_STABLE_HLO = 2; -// Version 3 supports platform checking and multiple platforms. From 02/2023. -const int VERSION_START_PLATFORMS = 3; -// Version 4 supports StableHLO with compatibility guarantees. From 03/2023 -const int VERSION_START_STABLE_HLO_COMPATIBILITY = 4; -const int VERSION_MINIMUM_SUPPORTED = VERSION_START_STABLE_HLO; - -// Computes a dimension value from the dim_arg specification. -// The specification is of the form ".". -StatusOr ComputeDimensionValue(int version, string dim_arg_spec, - std::vector arguments, - mlir::OpBuilder op_builder, - mlir::Type dim_arg_type) { - static const LazyRE2 dim_arg_spec_re = {R"((\d+).(\d+))"}; - int arg_idx, arg_axis_idx; - if (!RE2::FullMatch(dim_arg_spec, *dim_arg_spec_re, &arg_idx, - &arg_axis_idx)) { - return errors::InvalidArgument("Syntax error in dim_args_spec '", - dim_arg_spec, "'"); - } - if (arg_idx < 0 || arg_idx >= arguments.size()) { - return errors::InvalidArgument( - "Invalid argument index ", arg_idx, - " when the number of non-dimension arguments is ", arguments.size(), - " in dim_arg_spec '", dim_arg_spec, "'"); - } - mlir::RankedTensorType arg_type = - arguments[arg_idx].getType().dyn_cast(); - if (!arg_type) { - return errors::InvalidArgument( - "Argument ", arg_idx, " referenced in dim_arg_spec '", dim_arg_spec, - "' does not have a RankedTensorType"); - } - if (arg_axis_idx < 0 || arg_axis_idx >= arg_type.getShape().size()) { - return errors::InvalidArgument("Invalid axis index ", arg_axis_idx, - " when the rank of non-dimension argument ", - arg_idx, " is ", arg_type.getShape().size(), - " in dim_arg_spec '", dim_arg_spec, "'"); - } - mlir::Value val; - mlir::Type get_dim_type = - mlir::RankedTensorType::get({}, op_builder.getI32Type()); - val = op_builder.create( - arguments[arg_idx].getLoc(), get_dim_type, arguments[arg_idx], - op_builder.getI64IntegerAttr(arg_axis_idx)); - if (dim_arg_type != get_dim_type) { - val = op_builder.create( - arguments[arg_idx].getLoc(), dim_arg_type, val); - } - return val; -} - -// Adds a wrapper for the "main" function to compute the platform index and the -// dimension arguments. -// -// The input module has the following structure: -// -// func public main(%arg_platform_index: i32, %arg_dim0: i32, %arg_dim1: i32, -// %arg0: f32[?, ?, 8]) { ... } -// -// where %arg_platform_index is the index of the current compilation platform -// among the declared `platforms` (missing if version < 3 or if platforms has -// fewer than 2 elements), %arg_dim0 and %arg_dim1 are dimension arguments -// (missing if dim_args_spec is empty). The value of the dimension arguments -// are computed based on the static shapes of the actual arguments -// (%arg0 and following). -// In the above example, the dim_args_spec array would have two elements, one -// for %arg_dim0 and one for %arg_dim1. E.g., ['0.0', '0.1'] specifies that -// %arg_dim0 should be set to the size of axis 0 or array argument 0 (%arg0), -// while %arg_dim1 should be set to the size of axis 1. -// The platform index argument must be a 0-dimensional 32-bit integer, and the -// dimension arguments must be 0-dimensional tensors of integer type. 
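The "<arg_idx>.<axis_idx>" convention described above (e.g. ['0.0', '0.1']) is unchanged by this patch and is still consumed by the new loader's AddMainWrapper. The snippet below is an illustrative sketch of just the parsing step, reusing the RE2 pattern from the removed ComputeDimensionValue; the helper name ParseDimArgSpec is invented for this sketch and is not part of the patch.

```c++
// Illustrative sketch only: parsing one dim_args_spec entry of the form
// "<arg_idx>.<axis_idx>", e.g. "0.1".
#include <string>

#include "tensorflow/tsl/platform/regexp.h"

bool ParseDimArgSpec(const std::string& spec, int* arg_idx, int* axis_idx) {
  // Same pattern as the (removed) ComputeDimensionValue above.
  static const LazyRE2 kSpecRe = {R"((\d+).(\d+))"};
  // "0.1" -> *arg_idx = 0, *axis_idx = 1, i.e. args[0].shape[1].
  return RE2::FullMatch(spec, *kSpecRe, arg_idx, axis_idx);
}
```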
-// -// We create a new "main" function as follows: -// func public main(%arg0: f32[?, ?, 8]) { -// %arg_platform_index = stablehlo.constant -// %arg_dim0 = stablehlo.get_dimension_size(%arg0) dimension=0 -// %arg_dim1 = stablehlo.get_dimension_size(%arg0) dimension=1 -// %res = func.call _wrapped_main(%arg_platform_index, -// %arg_dim0, %arg_dim1, %arg0) -// return %res -// } -// func private _wrapped_main(%arg_platform_index: i32, -// %arg_dim0: i32, %arg_dim1: i32, -// %arg0: f32[?, ?, 8]) { -// ... the original main function ... -// } -// -// and then we run the inliner. This is important because in the -// RefineDynamicShapes method called in Compile we refine the shape of the -// array arguments. This would create a type error at the call to _wrapped_main -// with the expected type of %arg0. -Status AddMainWrapper(int version, mlir::ModuleOp module, int platform_index, - std::vector dim_args_spec) { - int nr_dim_args = dim_args_spec.size(); - // Locate the 'main' function. - // This is the convention used by MlirToXlaComputation. - mlir::func::FuncOp orig_main = - module.lookupSymbol("main"); - if (!orig_main) { - return errors::InvalidArgument("Cannot find 'main' in module"); - } - int nr_platform_args = 0; - if (platform_index >= 0) { - nr_platform_args = 1; - } - if (orig_main.getNumArguments() <= nr_platform_args + nr_dim_args) { - return errors::InvalidArgument("The module should have ", nr_platform_args, - " platform index arguments and ", - nr_dim_args, " dimension arguments, but it ", - "has only ", orig_main.getNumArguments(), - " total arguments"); - } - mlir::Block &orig_main_body = orig_main.front(); - - mlir::SymbolTable::setSymbolVisibility( - orig_main, mlir::SymbolTable::Visibility::Private); - mlir::OpBuilder op_builder(module.getBodyRegion()); - orig_main.setName(op_builder.getStringAttr("_wrapped_main")); - mlir::Location loc = module.getLoc(); - std::vector new_main_arg_types( - orig_main.getArgumentTypes().begin() + nr_platform_args + nr_dim_args, - orig_main.getArgumentTypes().end()); - mlir::func::FuncOp new_main = op_builder.create( - loc, "main", - mlir::FunctionType::get(module.getContext(), - /*inputs=*/new_main_arg_types, - /*results=*/orig_main.getResultTypes())); - mlir::SymbolTable::setSymbolVisibility(new_main, - mlir::SymbolTable::Visibility::Public); - mlir::Block *new_main_block = new_main.addEntryBlock(); - std::vector block_args(new_main_block->getArguments().begin(), - new_main_block->getArguments().end()); - op_builder.setInsertionPointToStart(new_main_block); - - std::vector call_args(orig_main_body.getNumArguments()); - for (int i = 0; i < orig_main_body.getNumArguments(); ++i) { - if (i < nr_platform_args + nr_dim_args) { - mlir::Type arg_type = orig_main.getArgument(i).getType(); - mlir::RankedTensorType arg_ranked_type = - arg_type.dyn_cast(); - if (!arg_ranked_type || - !arg_ranked_type.getElementType().dyn_cast() || - !arg_ranked_type.getShape().empty()) { - string argument_type = - (i < nr_platform_args) ? 
"platform index" : "dimension"; - return errors::InvalidArgument( - "Module argument at index ", i, - " should be a 0-dimensional integer-tensor ", argument_type, - " argument but has type ", debugString(arg_type)); - } - if (i < nr_platform_args) { - if (arg_ranked_type.getElementTypeBitWidth() != 32) { - return errors::InvalidArgument( - "Module argument at index ", i, - " should be a 0-dimensional 32-bit integer-tensor" - " platform index argument but has type ", - debugString(arg_type)); - } - call_args[i] = op_builder.create( - block_args[0].getLoc(), - op_builder.getI32IntegerAttr(platform_index)); - } else { - TF_ASSIGN_OR_RETURN( - call_args[i], - ComputeDimensionValue(version, dim_args_spec[i - nr_platform_args], - block_args, op_builder, - orig_main.getArgument(i).getType())); - } - } else { - call_args[i] = - new_main_block->getArgument(i - nr_platform_args - nr_dim_args); - } - } - mlir::func::CallOp call_op = op_builder.create( - loc, orig_main.getResultTypes(), orig_main.getSymName(), call_args); - op_builder.create(loc, call_op.getResults()); - VLOG(3) << "XlaCallModule module with wrapper: " << debugString(module); - - return OkStatus(); -} - -// Refines the dynamic module arguments based on the static argument shapes. -// This assumes that the module has a "main" function without dimension args, -// but possibly with dynamic shapes. We read the static shapes of the inputs, -// then set them as the types of the function parameters, and run StableHLO -// shape refinement to specialize all dynamic shapes in the StableHLO program -// to static shapes. -Status RefineDynamicShapes(XlaOpKernelContext *ctx, - mlir::OwningOpRef *module, - int nr_platform_args, int nr_dim_args) { - // Locate the (wrapped) 'main' function. - // This is the convention used by MlirToXlaComputation. - mlir::func::FuncOp main = (*module)->lookupSymbol("main"); - if (!main) { - return errors::InvalidArgument("Cannot find 'main' in module"); - } - mlir::Block &main_body = main.front(); - int non_dimension_arguments = ctx->num_inputs(); - if (non_dimension_arguments != main_body.getNumArguments()) { - return errors::InvalidArgument( - "Incorrect number of arguments passed to XlaCallModule: ", - non_dimension_arguments, ". The module takes ", - main_body.getNumArguments() + nr_platform_args + nr_dim_args, - " arguments of which ", nr_platform_args, - " platform index arguments and ", nr_dim_args, - " dimension arguments. It must be called with ", - main_body.getNumArguments(), " arguments."); - } - - mlir::Builder builder((*module)->getContext()); - std::vector static_array_input_types(non_dimension_arguments); - for (int i = 0, end = non_dimension_arguments; i < end; ++i) { - TF_ASSIGN_OR_RETURN(xla::Shape xla_shape, ctx->InputXlaShape(i)); - std::vector xla_dimensions(xla_shape.dimensions().begin(), - xla_shape.dimensions().end()); - TF_ASSIGN_OR_RETURN( - mlir::Type element_type, - ConvertPrimitiveTypeToMLIRType(xla_shape.element_type(), builder)); - mlir::Type type = mlir::RankedTensorType::get(xla_dimensions, element_type); - // TODO(burmako): This fails with an obscure compilation error. - // OP_REQUIRES_VALUE( - // mlir::Type type, ctx, - // ConvertShapeToType(xla_shape, builder)); - VLOG(3) << "XlaCallModule static array input type #" << i << ": " - << debugString(type); - static_array_input_types[i] = type; - } - - // Refine 'main' argument types to use static input types instead. - // This will only change the argument types and will not propagate the - // additional type information further. 
For that, we'll need to run - // shape refinement as explained below. - // Before refining the argument types it is useful to run the inliner to - // remove calls that may be called with the input arguments. - mlir::PassManager pm_inline((*module)->getContext()); - pm_inline.addPass(mlir::createInlinerPass()); - if (!mlir::succeeded(pm_inline.run(**module))) { - return errors::InvalidArgument("Module inlining failed"); - } - VLOG(3) << "XlaCallModule module after inlining: " << debugString(module); - - auto static_array_output_types = llvm::to_vector(main.getResultTypes()); - for (auto i = 0; i < main_body.getNumArguments(); ++i) { - auto arg = main_body.getArgument(i); - arg.setType(static_array_input_types[i]); - // If the argument is used by `func.return`, then we also need to - // update function result types. It's not great that we need this hack, - // but in the future when we have stablehlo.func, stablehlo.return, etc, - // this will not be needed. - // TODO(burmako): Once https://github.com/openxla/stablehlo/issues/425 is - // fixed, clean this up. - for (mlir::OpOperand &use : arg.getUses()) { - if (auto ret = llvm::dyn_cast(use.getOwner())) { - static_array_output_types[use.getOperandNumber()] = arg.getType(); - } - } - } - main.setType(builder.getFunctionType(static_array_input_types, - static_array_output_types)); - - // Verify the module before running passes on it. - // If the module doesn't pass verification, all sorts of weirdness might - // happen if we run the pass manager. - if (failed(verify(**module))) { - VLOG(3) << "XlaCallModule module with verification failed: " - << debugString(**module); - return errors::InvalidArgument("Module verification failed"); - } - mlir::PassManager pm((*module)->getContext()); - if (VLOG_IS_ON(3)) { - auto print_before = [](mlir::Pass *, mlir::Operation *) { return true; }; - auto print_after = [](mlir::Pass *, mlir::Operation *) { return true; }; - pm.enableIRPrinting(print_before, print_after, /*printModuleScope=*/true, - /*printAfterOnlyOnChange=*/false); - } - pm.addPass(mlir::createCSEPass()); - pm.addPass(mlir::stablehlo::createStablehloRefineShapesPass()); - if (!mlir::succeeded(pm.run(**module))) { - return errors::InvalidArgument("Module shape refinement failed"); - } - - VLOG(3) << "XlaCallModule module with refined shapes: " - << debugString(**module); - return OkStatus(); -} - -Status LoadAndPreprocessModule(int version, - mlir::OwningOpRef *module, - mlir::MLIRContext *context, string module_str, - std::vector dim_args_spec, - std::vector platforms, - int platform_index, int *nr_outputs) { - // Load a superset of dialects; we should check at serialization time that - // we only include allowable dialects. - context->loadDialect(); - context->loadDialect(); - context->loadDialect(); - context->loadDialect(); - context->loadDialect(); - // Parses both IR text and bytecode. 
- if (version >= VERSION_START_STABLE_HLO_COMPATIBILITY) { - *module = mlir::stablehlo::deserializePortableArtifact(module_str, context); - } else { - *module = mlir::parseSourceString(module_str, context); - } - - if (!*module) { - return errors::InvalidArgument("Cannot deserialize computation"); - } - VLOG(3) << "Parsed serialized module (version " << version - << ", platforms = [" << absl::StrJoin(platforms, ", ") << "]" - << ", platform_index = " << platform_index << ", dim_args_spec = [" - << absl::StrJoin(dim_args_spec, ", ") << "])\n" - << debugString(**module); - - if (failed((*module)->verifyInvariants())) { - VLOG(1) << "MLIR verification failed."; - (*module)->dump(); - return errors::InvalidArgument("Error verifying module"); - } - mlir::func::FuncOp main = (*module)->lookupSymbol("main"); - if (!main) { - return errors::InvalidArgument("Cannot find 'main' in module"); - } - - if (!dim_args_spec.empty() || platform_index >= 0) { - TF_RETURN_IF_ERROR( - AddMainWrapper(version, **module, platform_index, dim_args_spec)); - main = (*module)->lookupSymbol("main"); - } - *nr_outputs = main.getNumResults(); - return OkStatus(); -} - -// Validate that the module represents a statically-shaped StableHLO program, -// otherwise all sorts of weirdness might happen in the HLO exporter which -// is much easier to detect here. -Status ValidateModule(mlir::ModuleOp module) { - bool moduleHasUnsupportedDialects = false; - bool moduleHasDynamicShapes = false; - - module.walk([&](mlir::Operation *op) { - // StableHLO programs created by jax2tf only contain operations - // from Builtin, Func and StableHLO dialects. - if (!llvm::isa( - op->getDialect())) { - moduleHasUnsupportedDialects = true; - VLOG(3) << "Operation has unsupported dialects: " << debugString(op); - } - - // It's sufficient to only check results because operands either come from - // results or from block arguments which are checked below. - auto hasDynamicShape = [](mlir::Value value) { - auto shaped_type = value.getType().dyn_cast(); - return shaped_type ? !shaped_type.hasStaticShape() : false; - }; - bool opHasDynamicShapes = false; - opHasDynamicShapes |= llvm::any_of(op->getResults(), hasDynamicShape); - for (mlir::Region ®ion : op->getRegions()) { - opHasDynamicShapes |= - llvm::any_of(region.getArguments(), hasDynamicShape); - } - if (opHasDynamicShapes) { - moduleHasDynamicShapes = true; - VLOG(3) << "Operation has dynamic shapes: " << debugString(op); - } - }); - - if (moduleHasUnsupportedDialects) - return errors::InvalidArgument("Module has unsupported dialects"); - if (moduleHasDynamicShapes) - return errors::InvalidArgument("Module has dynamic shapes"); - return OkStatus(); -} - class XlaCallModuleOp : public XlaOpKernel { public: explicit XlaCallModuleOp(OpKernelConstruction *ctx) : XlaOpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("version", &version_)); - OP_REQUIRES( - ctx, version_ >= VERSION_MINIMUM_SUPPORTED, - errors::InvalidArgument("XlaCallModuleOp with version ", version_, - " is not supported anymore. 
Must be >= ", - VERSION_MINIMUM_SUPPORTED)); + int version; + OP_REQUIRES_OK(ctx, ctx->GetAttr("version", &version)); string module_str; OP_REQUIRES_OK(ctx, ctx->GetAttr("module", &module_str)); std::vector expected_output_shapes; OP_REQUIRES_OK(ctx, ctx->GetAttr("Sout", &expected_output_shapes)); std::vector expected_output_dtypes; OP_REQUIRES_OK(ctx, ctx->GetAttr("Tout", &expected_output_dtypes)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("dim_args_spec", &dim_args_spec_)); + std::vector dim_args_spec; + OP_REQUIRES_OK(ctx, ctx->GetAttr("dim_args_spec", &dim_args_spec)); OP_REQUIRES(ctx, expected_output_shapes.size() == expected_output_dtypes.size(), errors::InvalidArgument("The size of Sout (", @@ -456,12 +55,14 @@ class XlaCallModuleOp : public XlaOpKernel { ") must match the size of Tout (", expected_output_dtypes.size(), ")")); std::vector platforms; - platform_index_ = -1; - if (version_ >= VERSION_START_PLATFORMS) { + // Index in platforms of the current platform, or -1 if module does not take + // a platform index arg. + int platform_index = -1; + if (ctx->HasAttr("platforms")) { OP_REQUIRES_OK(ctx, ctx->GetAttr("platforms", &platforms)); if (!platforms.empty()) { - std::string current_device_type = ctx->device_type().type_string(); - std::string current_platform = ""; + string current_device_type = ctx->device_type().type_string(); + string current_platform = ""; if (current_device_type == DEVICE_CPU_XLA_JIT) { current_platform = "CPU"; } else if (current_device_type == DEVICE_GPU_XLA_JIT) { @@ -491,46 +92,51 @@ class XlaCallModuleOp : public XlaOpKernel { // We only use a platform index arguments if we support at least 2 // platforms. if (platforms.size() > 1) { - platform_index_ = found_platform - platforms.begin(); + platform_index = found_platform - platforms.begin(); } } } - OP_REQUIRES_OK( - ctx, LoadAndPreprocessModule(version_, &module_, &context_, module_str, - dim_args_spec_, platforms, platform_index_, - &nr_outputs_)); + + auto loader = + XlaCallModuleLoader::Create(&context_, version, std::move(module_str), + std::move(dim_args_spec), platform_index); + OP_REQUIRES_OK(ctx, loader.status()); + loader_ = *std::move(loader); } void Compile(XlaOpKernelContext *ctx) override { - OP_REQUIRES_OK( - ctx, RefineDynamicShapes(ctx, &module_, (platform_index_ >= 0 ? 
1 : 0), - dim_args_spec_.size())); - OP_REQUIRES_OK(ctx, ValidateModule(*module_)); + std::vector input_shapes; + for (int i = 0; i < ctx->num_inputs(); ++i) { + auto shape = ctx->InputXlaShape(i); + OP_REQUIRES_OK(ctx, shape.status()); + input_shapes.push_back(*std::move(shape)); + } + OP_REQUIRES_OK(ctx, loader_->RefineDynamicShapes(input_shapes)); + OP_REQUIRES_OK(ctx, loader_->ValidateModule()); std::vector inputs(ctx->num_inputs()); for (int i = 0, end = ctx->num_inputs(); i < end; ++i) { inputs[i] = ctx->Input(i); } - xla::XlaComputation xla_computation; - OP_REQUIRES_OK( - ctx, MlirToXlaComputation(*module_, xla_computation, false, false)); + auto xla_computation = loader_->ToXlaComputation(); + OP_REQUIRES_OK(ctx, xla_computation.status()); if (VLOG_IS_ON(3)) { OP_REQUIRES_VALUE( const xla::HloModuleConfig module_config, ctx, xla::HloModule::CreateModuleConfigFromProto( - xla_computation.proto(), xla::GetDebugOptionsFromFlags())); + xla_computation->proto(), xla::GetDebugOptionsFromFlags())); OP_REQUIRES_VALUE(std::unique_ptr hlo_module, ctx, - xla::HloModule::CreateFromProto(xla_computation.proto(), - module_config)); + xla::HloModule::CreateFromProto( + xla_computation->proto(), module_config)); xla::HloPrintOptions options; options = xla::HloPrintOptions::ShortParsable(); VLOG(3) << "XlaCallModule converted to HLO module " << hlo_module->ToString(options); } - xla::XlaOp output = xla::Call(ctx->builder(), xla_computation, inputs); + xla::XlaOp output = xla::Call(ctx->builder(), *xla_computation, inputs); // Check that the resulting computation returns the expected shape OP_REQUIRES_VALUE(xla::Shape found_output_shape, ctx, @@ -538,25 +144,21 @@ class XlaCallModuleOp : public XlaOpKernel { VLOG(3) << "XlaCallModule compiled output shape : " << xla::ShapeUtil::HumanString(found_output_shape); - if (nr_outputs_ == 1) { + if (loader_->nr_outputs() == 1) { ctx->SetOutput(0, output); } else { - for (int i = 0; i < nr_outputs_; ++i) { + for (int i = 0; i < loader_->nr_outputs(); ++i) { ctx->SetOutput(i, xla::GetTupleElement(output, i)); } } } private: - int version_; - int nr_outputs_; - std::vector dim_args_spec_; - int platform_index_; // Index in platforms of the current platform, or -1 - // if module does not take a platform index arg. 
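For orientation, the constructor and Compile() changes above reduce to a fixed call sequence on the new XlaCallModuleLoader. The sketch below is illustrative only: the free-standing wrapper name CompileSerializedModule is invented, and kernel-specific error plumbing (OP_REQUIRES_OK and friends) is replaced by plain status propagation.

```c++
// Illustrative sketch only (not part of this patch): driving the new
// XlaCallModuleLoader in the same order XlaCallModuleOp does.
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h"
#include "tensorflow/compiler/xla/shape.h"

tsl::StatusOr<xla::XlaComputation> CompileSerializedModule(
    mlir::MLIRContext* context, int version, std::string module_str,
    std::vector<std::string> dim_args_spec, int platform_index,
    const std::vector<xla::Shape>& input_shapes) {
  // Create() deserializes the module and, when dim_args_spec or a platform
  // index argument is present, wraps "main" so those values are computed
  // from the actual array arguments.
  auto loader = tensorflow::XlaCallModuleLoader::Create(
      context, version, std::move(module_str), std::move(dim_args_spec),
      platform_index);
  if (!loader.ok()) return loader.status();
  std::unique_ptr<tensorflow::XlaCallModuleLoader> ld = *std::move(loader);

  // Specialize dynamic shapes to the static shapes of the actual inputs.
  tsl::Status status = ld->RefineDynamicShapes(input_shapes);
  if (!status.ok()) return status;

  // Reject modules that still carry dynamic shapes or unsupported dialects.
  status = ld->ValidateModule();
  if (!status.ok()) return status;

  return ld->ToXlaComputation();
}
```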
mlir::MLIRContext context_{mlir::MLIRContext::Threading::DISABLED}; - mlir::OwningOpRef module_; + std::unique_ptr loader_; }; REGISTER_XLA_OP(Name("XlaCallModule"), XlaCallModuleOp); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/literal_util_test.cc b/tensorflow/compiler/tf2xla/literal_util_test.cc index 05af57e551a..e1fa0821054 100644 --- a/tensorflow/compiler/tf2xla/literal_util_test.cc +++ b/tensorflow/compiler/tf2xla/literal_util_test.cc @@ -32,10 +32,10 @@ TEST(LiteralUtil, LiteralToHostTensor) { Tensor host_tensor; EXPECT_EQ("Cannot convert literal of type S64 to tensor of type int32", LiteralToHostTensor(int64_values_literal, DT_INT32, &host_tensor) - .error_message()); + .message()); EXPECT_EQ("Cannot convert literal of type S64 to tensor of type qint32", LiteralToHostTensor(int64_values_literal, DT_QINT32, &host_tensor) - .error_message()); + .message()); EXPECT_TRUE( LiteralToHostTensor(int64_values_literal, DT_INT64, &host_tensor).ok()); test::ExpectTensorEqual(host_tensor, diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 5ffe2a06f34..3959ebb5771 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -163,42 +163,75 @@ MlirOptimizationPassState MlirBridgePass::GetPassState( const DeviceSet* device_set, const ConfigProto& config_proto, const Graph& graph, const FunctionLibraryDefinition& function_library) const { - // Skip MLIR TF XLA Bridge if no TPU devices found and the non TPU graph is - // not qualified. - if (device_set && !HasTPUDevice(*device_set) && !EnableNonTpuBridge(graph)) { + // Skip MLIR TF/XLA Bridge if no TPU devices and no qualified CPU/GPU + // graphs are found. + bool has_tpu_device = device_set ? HasTPUDevice(*device_set) : false; + // GetPassState is called once before MlirBridgePass starts, and the pass + // gets skipped if it is disabled. Log such cases in this function. The cases + // where the pass is enabled will only be logged during their execution to + // prevent them from being counted twice. + if (device_set && !has_tpu_device && !EnableNonTpuBridge(graph)) { + // Only record CPU/GPU graphs that are qualified but filtered out + if (HasQualifiedNonTPUOp(graph)) { + metrics::UpdateTfMlirBridgeFirstPhaseCounter( + /*device type*/ "cpu/gpu", + /*bridge version*/ "tfxla", + /*fallback_enabled*/ false, + /*result*/ "invalid_graph"); + } return MlirOptimizationPassState::Disabled; } // We set `uses_uninitialized_resource_args` to false here because the first // phase of the bridge is not affected by uninitialized resource args. MlirBridgeRolloutPolicy policy = GetMlirBridgeRolloutPolicy( - graph, &function_library, config_proto, + graph, &function_library, config_proto, /*is_tpu_graph*/ has_tpu_device, /*uses_uninitialized_resource_args=*/false, /*is_v1_compat=*/false, /*record_stats=*/false); + if (has_tpu_device) { + switch (policy) { + case MlirBridgeRolloutPolicy::kEnabledByUser: + return MlirOptimizationPassState::Enabled; + case MlirBridgeRolloutPolicy::kEnabledAfterGraphAnalysis: + return MlirOptimizationPassState::FallbackEnabled; + case MlirBridgeRolloutPolicy::kDisabledByUser: + VLOG(1) << "Skipping MLIR TPU Bridge, disabled by user. 
" + "Old bridge will evaluate."; + metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v2", true, + "disabled_by_user"); + return MlirOptimizationPassState::Disabled; + case MlirBridgeRolloutPolicy::kDisabledAfterGraphAnalysis: + VLOG(1) << "Skipping MLIR TPU Bridge, disabled because " + "graph has unsupported features. Old bridge will evaluate."; + metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v2", true, + "invalid_graph"); + // We set `uses_uninitialized_resource_args` to false here because the + // first phase of the bridge is not affected by uninitialized resource + // args. + // For Invalid Graph Analysis we need to log here because Run will not + // be called. + LogGraphFeatures(graph, &function_library, config_proto, + /*uses_uninitialized_resource_args=*/false, + /*is_v1_compat=*/false); + return MlirOptimizationPassState::Disabled; + } + } + // TODO(b/277112519): Have uniform behavior for GPU/CPU and TPU switch (policy) { case MlirBridgeRolloutPolicy::kEnabledByUser: return MlirOptimizationPassState::Enabled; case MlirBridgeRolloutPolicy::kEnabledAfterGraphAnalysis: return MlirOptimizationPassState::FallbackEnabled; case MlirBridgeRolloutPolicy::kDisabledByUser: - VLOG(1) << "Skipping MLIR TPU Bridge, MLIR TPU bridge disabled by user. " - "Old bridge will evaluate."; - metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v2", true, + VLOG(1) << "Skipping MLIR CPU/GPU Bridge, disabled by user."; + metrics::UpdateTfMlirBridgeFirstPhaseCounter("cpu/gpu", "tfxla", false, "disabled_by_user"); return MlirOptimizationPassState::Disabled; - case MlirBridgeRolloutPolicy::kDisabledAfterGraphAnalysis: - VLOG(1) << "Skipping MLIR TPU Bridge, MLIR TPU bridge disabled because " - "graph has unsupported features. Old bridge will evaluate."; - metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v2", true, + default: + // This case should never be hit. Added here to be consistent with OSS + // implementation. + metrics::UpdateTfMlirBridgeFirstPhaseCounter("cpu/gpu", "ftxla", false, "invalid_graph"); - // We set `uses_uninitialized_resource_args` to false here because the - // first phase of the bridge is not affected by uninitialized resource - // args. - // For Invalid Graph Analysis we need to log here because Run will not be - // called. - LogGraphFeatures(graph, &function_library, config_proto, - /*uses_uninitialized_resource_args=*/false, - /*is_v1_compat=*/false); return MlirOptimizationPassState::Disabled; } } @@ -209,14 +242,15 @@ MlirOptimizationPassState MlirBridgePass::GetPassState( // and attached to a "compile" operation, whose result is fed to an "execute" // operation. The kernel for these operations is responsible to lower the // encapsulated graph to a particular device. -Status MlirBridgePass::Run(const ConfigProto& config_proto, +Status MlirBridgePass::Run(const std::string& function_name, + const ConfigProto& config_proto, mlir::ModuleOp module, const Graph& graph, const FunctionLibraryDefinition& function_library) { static absl::once_flag flag; absl::call_once(flag, UpdateLogVerbosityIfDefined, "TF_DEBUG_LOG_VERBOSITY"); // Check if there are TPU devices or TPU ops. If not, then check if the - // non TPU graph is qualified to run TF XLA Bridge. + // non TPU graph is qualified to run TF2XLA Bridge. // This check needs to precede GetPassState for instrumentation purposes. 
bool is_qualified_for_tpu_bridge = HasTPUDevicesAndOps(module), is_qualified_for_non_tpu_bridge = false; @@ -224,7 +258,7 @@ Status MlirBridgePass::Run(const ConfigProto& config_proto, is_qualified_for_non_tpu_bridge = EnableNonTpuBridge(graph); if (!is_qualified_for_tpu_bridge && !is_qualified_for_non_tpu_bridge) { VLOG(1) - << "Skipping MLIR TF XLA Bridge, no qualified devices or ops found."; + << "Skipping MLIR TF2XLA Bridge, no qualified devices or ops found."; return OkStatus(); } @@ -259,11 +293,10 @@ Status MlirBridgePass::Run(const ConfigProto& config_proto, } VLOG(1) << "Running MLIR TPU Bridge"; mlir_bridge_gauge_v2->GetCell()->Set(true); - return mlir::TFTPU::TPUBridge(module, /*enable_logging=*/VLOG_IS_ON(1), - fallback_enabled); + return mlir::TFTPU::TPUBridge(module, fallback_enabled, function_name); } - VLOG(1) << "Running MLIR non-TPU Bridge"; - return mlir::TF::RunTFXLABridge(module, VLOG_IS_ON(1)); + VLOG(1) << "Running MLIR CPU/GPU Bridge"; + return mlir::TF::RunTFXLABridge(module, function_name); } MlirOptimizationPassState MlirBridgeV1CompatPass::GetPassState( @@ -277,6 +310,7 @@ MlirOptimizationPassState MlirBridgeV1CompatPass::GetPassState( // phase of the bridge is not affected by uninitialized resource args. MlirBridgeRolloutPolicy policy = GetMlirBridgeRolloutPolicy( graph, /*function_library=*/&function_library, config_proto, + /*is_tpu_graph*/ true, /*uses_uninitialized_resource_args=*/false, /*is_v1_compat=*/true, /*record_stats=*/false); switch (policy) { @@ -356,8 +390,7 @@ Status MlirBridgeV1CompatPass::Run(const GraphOptimizationPassOptions& options, mlir_bridge_gauge_v1->GetCell()->Set(true); - return mlir::TFTPU::TPUBridgeV1Compat( - module, /*enable_logging=*/VLOG_IS_ON(1), fallback_enabled); + return mlir::TFTPU::TPUBridgeV1Compat(module, fallback_enabled); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.h b/tensorflow/compiler/tf2xla/mlir_bridge_pass.h index f0f8424cab5..ff32bc5f9ad 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.h +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_MLIR_BRIDGE_PASS_H_ #define TENSORFLOW_COMPILER_TF2XLA_MLIR_BRIDGE_PASS_H_ +#include + #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" #include "llvm/ADT/StringRef.h" #include "tensorflow/compiler/jit/flags.h" @@ -37,8 +39,8 @@ class MlirBridgePass : public MlirOptimizationPass { // This should be used as a thin mapper around mlir::ModulePass::runOnModule // API integrated with the Tensorflow runtime. - Status Run(const ConfigProto& config_proto, mlir::ModuleOp module, - const Graph& graph, + Status Run(const std::string& function_name, const ConfigProto& config_proto, + mlir::ModuleOp module, const Graph& graph, const FunctionLibraryDefinition& function_library) override; }; diff --git a/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.cc b/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.cc index 0e61e144f93..694a1a15910 100644 --- a/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.cc @@ -15,12 +15,43 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" +#include + +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/compiler/jit/xla_compile_util.h" #include "tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.h" #include "tensorflow/compiler/mlir/utils/array_container_utils.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/framework/resource_op_kernel.h" namespace tensorflow { +namespace { + +class MLIRContextResource : public ResourceBase { + public: + static constexpr const char* kDefaultResourceName = + "mlir-xla-op-cached-context"; + + static Status Create(MLIRContextResource** resource) { + *resource = new MLIRContextResource(); + return OkStatus(); + } + mlir::MLIRContext* GetContext() { return &mlir_ctx_; } + std::string DebugString() const override { + return "MlirXlaOpKernel MLIRContext resource"; + } + + private: + // Since this kernel implements lowering for a single TF operation, we + // disable MLIR threading for efficiency purpose (avoid starting a large + // number of threads eagerly). + MLIRContextResource() : mlir_ctx_(mlir::MLIRContext::Threading::DISABLED) {} + mlir::MLIRContext mlir_ctx_; +}; + +} // namespace + Status MlirXlaOpKernel::ContextToXlaArgs( XlaOpKernelContext* ctx, std::vector& xla_args) { // Collect arguments that are registered as CompileTimeConstantInput. @@ -57,11 +88,7 @@ Status MlirXlaOpKernel::ContextToXlaArgs( } MlirXlaOpKernel::MlirXlaOpKernel(OpKernelConstruction* ctx) - : XlaOpKernel(ctx), - // Since this kernel implements lowering for a single TF operation, we - // disable MLIR threading for efficiency purpose (avoid starting a large - // number of threads eagerly). - mlir_ctx_(mlir::MLIRContext::Threading::DISABLED) {} + : XlaOpKernel(ctx) {} Status MlirXlaOpKernel::ConstructXlaOp(XlaOpKernelContext* ctx) { // Create input XlaArguments. @@ -99,11 +126,19 @@ Status MlirXlaOpKernel::ConstructXlaOp(XlaOpKernelContext* ctx) { TF_ASSIGN_OR_RETURN(auto graph, CreateSingleOpGraph(def(), xla_args, result_dtypes)); + ResourceMgr* res_manager = ctx->op_kernel_context()->resource_manager(); + MLIRContextResource* ctx_res; + TF_RETURN_IF_ERROR(res_manager->LookupOrCreate( + res_manager->default_container(), + MLIRContextResource::kDefaultResourceName, &ctx_res, + MLIRContextResource::Create)); + core::ScopedUnref unref_ctx(ctx_res); + // Compile the graph to HLO. GraphDebugInfo debug_info; std::vector returns(1); TF_RETURN_IF_ERROR(BuildHloFromGraph( - *graph, *ctx->builder(), mlir_ctx_, xla_params, returns, + *graph, *ctx->builder(), *ctx_res->GetContext(), xla_params, returns, mlir::SpanToArrayRef(xla_args), control_rets, device->device_type(), *ctx->function_library()->GetFunctionLibraryDefinition(), debug_info, diff --git a/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h b/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h index e4ece6e692a..ec62bd98a21 100644 --- a/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h @@ -16,7 +16,6 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2XLA_MLIR_XLA_OP_KERNEL_H_ #define TENSORFLOW_COMPILER_TF2XLA_MLIR_XLA_OP_KERNEL_H_ -#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" namespace tensorflow { @@ -32,7 +31,6 @@ class MlirXlaOpKernel : public XlaOpKernel { std::vector& xla_args); void Compile(XlaOpKernelContext* ctx) override; Status ConstructXlaOp(XlaOpKernelContext* ctx); - mlir::MLIRContext mlir_ctx_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index a66c953efeb..e536ffa3746 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -1325,6 +1325,7 @@ REGISTER_OP("XlaCallModule") .Attr("Tin: list(type) >= 0") .Attr("dim_args_spec: list(string) = []") .Attr("platforms: list(string) = []") + .Attr("function_list: list(func) = []") .SetShapeFn([](shape_inference::InferenceContext* c) { std::vector args_shapes; TF_RETURN_IF_ERROR(c->input("args", &args_shapes)); @@ -1347,8 +1348,8 @@ REGISTER_OP("XlaCallModule") .Doc(R"doc( Invokes a StableHLO module. -This op is experimental and is intended for use with JAX native serialization -in a TensorFlow context. +This op is used with JAX native serialization in a TensorFlow context with +stability guarantees. args: A list of `Tensor` with possibly different types to be passed as arguments to the `module`. These are the actual arguments and do not include the @@ -1357,7 +1358,10 @@ args: A list of `Tensor` with possibly different types to be passed as arguments version: Tracks changes the semantics of the op, to support backwards compatibility. Minimum supported version is 2. From version 2, the op carries a StableHLO text or bytecode `module`. From - version 3, the op also supports the `platforms` attribute. + version 3, the op also supports the `platforms` attribute. From version 4, + the op carries a StableHLO module with compatibility guarantees. From version + 5, XLACallModule can include `stablehlo.custom_call` op to execute tf + functions. module: A serialized computation, a text or bytecode representation of an mlir.Module. The return type must be a tuple if and only if the `Sout` is a list with 0 or more than 1 elements. The length of `Tout` and @@ -1382,6 +1386,11 @@ dim_args_spec: in presence of dynamic shapes, this is the specification for the string of the form "." that specifies that the value of the corresponding dimension argument must be "args[arg_idx].shape[axis_idx]", where "args" are the actual array arguments. +function_list: This list contains the TensorFlow FunctionDefs that are used by + the XLACallModule. If the XLACallModule contains `stablehlo.custom_call` + operations, they can call TensorFlow graph functions outside of the + XLACallModule. This `function_list` attribute registers the dependency of the + XLACallModule on those functions. This attribute was added in version 5. )doc"); } // namespace diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index d1273cd403e..61d2be76ac1 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -604,12 +604,12 @@ def custom_call_v2( ) -def call_module(args, *, version=2, module, Tout, Sout, - dim_args_spec=(), platforms=()): +def call_module(args, *, version=4, module, Tout, Sout, + dim_args_spec=(), platforms=(), function_list=()): # See documentation for the XlaCallModule op. 
return gen_xla_ops.xla_call_module( args, version=version, module=module, dim_args_spec=dim_args_spec, - Tout=Tout, Sout=Sout, platforms=platforms) + Tout=Tout, Sout=Sout, platforms=platforms, function_list=function_list) def gather(operand, diff --git a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc index 24cd31b3ba9..942b3ef0bdc 100644 --- a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc +++ b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc @@ -366,8 +366,13 @@ Status MaybeRewriteWhileNode( string new_name = fld->UniqueFunctionName(absl::StrCat(attr_value.name(), "_rearrange_")); TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, new_name, &new_fdef)); - TF_RETURN_IF_ERROR( - fld->AddFunctionDef(new_fdef, fld->GetStackTraces(attr_value.name()))); + + const StackTracesMap* stack_traces = fld->GetStackTraces(attr_value.name()); + if (stack_traces != nullptr) { + TF_RETURN_IF_ERROR(fld->AddFunctionDef(new_fdef, *stack_traces)); + } else { + TF_RETURN_IF_ERROR(fld->AddFunctionDef(new_fdef, {})); + } // Change node to use rewritten function. attr_value.set_name(new_name); @@ -457,11 +462,18 @@ Status MaybeRewriteIfNode( string new_name = fld->UniqueFunctionName(absl::StrCat(f.name(), "_rearrange_")); TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, new_name, &new_fdef)); - const StackTracesMap& stack_traces = - fld->GetStackTraces(f.name()).empty() && global_fld - ? global_fld->GetStackTraces(f.name()) - : fld->GetStackTraces(f.name()); - TF_RETURN_IF_ERROR(fld->AddFunctionDef(new_fdef, stack_traces)); + + const StackTracesMap* global_stack_traces = + global_fld ? global_fld->GetStackTraces(f.name()) : nullptr; + const StackTracesMap* local_stack_traces = fld->GetStackTraces(f.name()); + + if (global_stack_traces != nullptr) { + TF_RETURN_IF_ERROR(fld->AddFunctionDef(new_fdef, *global_stack_traces)); + } else if (local_stack_traces != nullptr) { + TF_RETURN_IF_ERROR(fld->AddFunctionDef(new_fdef, *local_stack_traces)); + } else { + TF_RETURN_IF_ERROR(fld->AddFunctionDef(new_fdef, {})); + } // Change node to use rewritten function. f.set_name(new_name); diff --git a/tensorflow/compiler/tf2xla/tf2xla_opset.cc b/tensorflow/compiler/tf2xla/tf2xla_opset.cc new file mode 100644 index 00000000000..a2a9ddde35b --- /dev/null +++ b/tensorflow/compiler/tf2xla/tf2xla_opset.cc @@ -0,0 +1,96 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/tf2xla_opset.h" + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/kernel_def.pb.h" + +namespace tensorflow { + +const int SUPPORTED_DEVICES_NUM = 2; +static const char* const SUPPORTED_DEVICES[SUPPORTED_DEVICES_NUM] = { + DEVICE_GPU_XLA_JIT, DEVICE_CPU_XLA_JIT}; + +bool IsSupportedBackend(absl::string_view device_name) { + for (int i = 0; i < SUPPORTED_DEVICES_NUM; i++) { + if (SUPPORTED_DEVICES[i] == device_name) return true; + } + return false; +} + +absl::Status RegisterBackends(absl::string_view device_name) { + if (!IsSupportedBackend(device_name)) { + return absl::InvalidArgumentError( + absl::StrCat(device_name, " is not supported. Supported devices are ", + absl::StrJoin(SUPPORTED_DEVICES, ", "))); + } + // All backends need to be registered before DeviceKernels is called + // because it calls RegisterCompilationKernels which will only run 1x, + // meaning if a device is registered afterwards the ops for that device + // will not be included. + auto op_filter = [](KernelDef* kdef) { + if (kdef->op() == "Const") { + AddDtypeToKernelDefConstraint("dtype", DT_STRING, kdef); + } + if (kdef->op() == "Assert") { + AddDtypeToKernelDefConstraint("T", DT_STRING, kdef); + } + return true; + }; + + // Backends might already be registered due to preprocesser macros defined + // in xla_op_registery.h so this first checks to see if they are registered + // already because re-registering the same device will cause a failure. + if (!XlaOpRegistry::IsBackendRegistered(DEVICE_GPU_XLA_JIT)) { + static auto gpu_backend = + XlaBackendRegistrar(DEVICE_GPU_XLA_JIT, kGpuAllTypes, op_filter); + } + if (!XlaOpRegistry::IsBackendRegistered(DEVICE_CPU_XLA_JIT)) { + static auto cpu_backend = + XlaBackendRegistrar(DEVICE_CPU_XLA_JIT, kCpuAllTypes, op_filter); + } + if (!XlaOpRegistry::IsBackendRegistered(std::string(device_name))) { + return absl::InternalError( + absl::StrCat(device_name, " is not registered.")); + } + return absl::OkStatus(); +} + +absl::StatusOr> GetRegisteredXlaOpsForDevice( + absl::string_view device_name) { + auto status = RegisterBackends(device_name); + if (!status.ok()) return status; + + std::vector kernel_defs = + XlaOpRegistry::DeviceKernels(std::string(device_name), true); + std::vector op_names; + op_names.reserve(kernel_defs.size()); + for (const auto& kernel_def : kernel_defs) { + op_names.push_back(kernel_def->op()); + } + std::sort(op_names.begin(), op_names.end()); + return op_names; +} +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_opset.h b/tensorflow/compiler/tf2xla/tf2xla_opset.h new file mode 100644 index 00000000000..37fa8f3940f --- /dev/null +++ b/tensorflow/compiler/tf2xla/tf2xla_opset.h @@ -0,0 +1,30 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_OPSET_H_ +#define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_OPSET_H_ + +#include +#include + +#include "absl/status/statusor.h" + +namespace tensorflow { + +absl::StatusOr> GetRegisteredXlaOpsForDevice( + absl::string_view device_name); + +} // namespace tensorflow +#endif // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_OPSET_H_ diff --git a/tensorflow/compiler/tf2xla/tf2xla_opset_test.cc b/tensorflow/compiler/tf2xla/tf2xla_opset_test.cc new file mode 100644 index 00000000000..f7031e06a4f --- /dev/null +++ b/tensorflow/compiler/tf2xla/tf2xla_opset_test.cc @@ -0,0 +1,62 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/tf2xla_opset.h" + +#include +#include +#include + +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +TEST(GeXlaOpsForDeviceTest, InvalidDeviceToRegister) { + absl::StatusOr> result = + GetRegisteredXlaOpsForDevice("Invalid_Device"); + EXPECT_FALSE(result.ok()); +} +TEST(GeXlaOpsForDeviceTest, GetGpuNames) { + absl::StatusOr> result = + GetRegisteredXlaOpsForDevice("XLA_GPU_JIT"); + EXPECT_GT(result.value().size(), 0); + auto matmul = + std::find(result.value().begin(), result.value().end(), "MatMul"); + auto max = std::find(result.value().begin(), result.value().end(), "Max"); + auto min = std::find(result.value().begin(), result.value().end(), "Min"); + EXPECT_TRUE((matmul != result.value().end())); + EXPECT_TRUE((max != result.value().end())); + EXPECT_TRUE((min != result.value().end())); + EXPECT_LT(matmul, max); + EXPECT_LT(max, min); +} +TEST(GeXlaOpsForDeviceTest, GetCpuNames) { + absl::StatusOr> result = + GetRegisteredXlaOpsForDevice("XLA_CPU_JIT"); + EXPECT_GT(result.value().size(), 0); + auto matmul = + std::find(result.value().begin(), result.value().end(), "MatMul"); + auto max = std::find(result.value().begin(), result.value().end(), "Max"); + auto min = std::find(result.value().begin(), result.value().end(), "Min"); + EXPECT_TRUE((matmul != result.value().end())); + EXPECT_TRUE((max != result.value().end())); + EXPECT_TRUE((min != result.value().end())); + EXPECT_LT(matmul, max); + EXPECT_LT(max, min); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index 2840076a3c3..f896f97a462 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ 
b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/graph_to_functiondef.h" @@ -256,10 +257,15 @@ Status PropagateConstIntoFuncAttr( FunctionDef replace_fdef; string new_func_name = fld->UniqueFunctionName(absl::StrCat(func_attr.name(), "_const_")); + const StackTracesMap* stack_traces = + lookup_fld->GetStackTraces(func_attr.name()); TF_RETURN_IF_ERROR( GraphToFunctionDef(*func_graph, new_func_name, &replace_fdef)); - TF_RETURN_IF_ERROR(fld->AddFunctionDef( - replace_fdef, lookup_fld->GetStackTraces(func_attr.name()))); + if (stack_traces != nullptr) { + TF_RETURN_IF_ERROR(fld->AddFunctionDef(replace_fdef, *stack_traces)); + } else { + TF_RETURN_IF_ERROR(fld->AddFunctionDef(replace_fdef, {})); + } VLOG(1) << "replace func " << func_attr.name() << " with " << new_func_name; // Change the node to use rewritten function. @@ -267,9 +273,6 @@ Status PropagateConstIntoFuncAttr( n->ClearAttr(attr_name); n->AddAttr(attr_name, func_attr); - TF_RETURN_IF_ERROR(fld->AddFunctionDef( - replace_fdef, lookup_fld->GetStackTraces(func_attr.name()))); - // Copy associated functions. TF_RETURN_IF_ERROR(CopyAssociatedFunctions(func_graph, lookup_fld, fld)); diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc index 70ca4576be0..d3ba6133243 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc @@ -42,8 +42,8 @@ namespace { void ExpectErrorContains(const Status& status, absl::string_view str) { EXPECT_NE(OkStatus(), status); - EXPECT_TRUE(absl::StrContains(status.error_message(), str)) - << "expected error: " << status.error_message() << " to contain: " << str; + EXPECT_TRUE(absl::StrContains(status.message(), str)) + << "expected error: " << status.message() << " to contain: " << str; } TEST(ValidateConfig, Good) { diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc index bd57112ccdc..1c24cffa93d 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" #include +#include #include "tensorflow/compiler/xla/cpu_function_runtime.h" @@ -24,9 +25,12 @@ namespace tensorflow { XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data, AllocMode alloc_mode) : raw_function_(static_data.raw_function_), + run_function_(static_data.run_function_), + cpu_executable_(static_data.cpu_executable_), result_index_(static_data.result_index_), buffer_table_(new void*[static_data.num_buffers_]), buffer_infos_(static_data.buffer_infos_), + num_buffers_(static_data.num_buffers_), arg_index_table_(static_data.arg_index_table_), num_args_(static_data.num_args_), num_variables_(static_data.num_variables_), @@ -53,12 +57,29 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data, } bool XlaCompiledCpuFunction::Run() { + if (run_function_) { + std::vector descriptor_table = + MakeXlaRuntimeDescriptorTable(); + return run_function_(cpu_executable_, descriptor_table, &run_options_); + } XlaCustomCallStatus status; raw_function_(buffer_table_[result_index_], &run_options_, nullptr, buffer_table_, &status, profile_counters_); return !xla::CustomCallStatusGetMessage(&status).has_value(); } +std::vector +XlaCompiledCpuFunction::MakeXlaRuntimeDescriptorTable() { + std::vector descriptor_table; + descriptor_table.reserve(num_buffers_); + for (int32_t i = 0; i < num_buffers_; ++i) { + void* data = buffer_table_[i]; + uint64_t size = buffer_infos_[i].size(); + descriptor_table.emplace_back(data, size); + } + return descriptor_table; +} + XlaCompiledCpuFunction::~XlaCompiledCpuFunction() { xla::cpu_function_runtime::FreeContiguous(alloc_buffer_table_); delete[] buffer_table_; diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 8e707278ed8..176f203e924 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -18,9 +18,11 @@ limitations under the License. #include #include +#include #include "tensorflow/compiler/xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/executable_run_options.h" +#include "tensorflow/compiler/xla/service/cpu/buffer_desc.h" #include "tensorflow/compiler/xla/service/custom_call_status_internal.h" #include "tensorflow/core/platform/types.h" @@ -29,6 +31,10 @@ limitations under the License. namespace xla { class ProgramShapeProto; class HloProfilePrinterData; + +namespace cpu { +class CpuExecutable; +} // namespace cpu } // namespace xla namespace tensorflow { @@ -54,6 +60,10 @@ class XlaCompiledCpuFunction { const xla::ExecutableRunOptions* run_options, const void** args, void** temps, XlaCustomCallStatus*, int64_t* profile_counters); + using RunFunction = + bool (*)(const xla::cpu::CpuExecutable* cpu_executable, + const std::vector& descriptor_table, + const xla::ExecutableRunOptions* run_options); // StaticData represents the state necessary to run an XLA-compiled // function. For JIT this is backed by data in XlaJitCompiledCpuFunction; for @@ -66,9 +76,12 @@ class XlaCompiledCpuFunction { // The raw function to call. RawFunction raw_function_; + RunFunction run_function_ = nullptr; + const xla::cpu::CpuExecutable* cpu_executable_ = nullptr; + // Contains information about the buffers used by the XLA computation. 
const xla::cpu_function_runtime::BufferInfo* buffer_infos_ = nullptr; - size_t num_buffers_ = 0; + int32_t num_buffers_ = 0; // Entry parameter i is described by // buffer_infos[arg_index_table[i]]. @@ -278,6 +291,16 @@ class XlaCompiledCpuFunction { static_data->raw_function_ = raw_function; } + static void set_static_data_run_function(StaticData* static_data, + RunFunction run_function) { + static_data->run_function_ = run_function; + } + + static void set_static_data_cpu_executable( + StaticData* static_data, const xla::cpu::CpuExecutable* cpu_executable) { + static_data->cpu_executable_ = cpu_executable; + } + static void set_static_data_buffer_infos( StaticData* static_data, const xla::cpu_function_runtime::BufferInfo* buffer_infos) { @@ -347,6 +370,12 @@ class XlaCompiledCpuFunction { private: const RawFunction raw_function_; + // TODO(ecg): RunFunction and CpuExecutable should go away. Instead, we should + // have a pointer or reference to a minimal wrapper around CpuExecutable's + // Execute(), without CpuExecutable's dependences. We could call this wrapper + // "XlaRuntimeRunner". + const RunFunction run_function_; + const xla::cpu::CpuExecutable* cpu_executable_; const size_t result_index_; // Array containing pointers to argument and temp buffers (slots corresponding @@ -355,6 +384,7 @@ class XlaCompiledCpuFunction { // Describes the buffers used by the XLA computation. const xla::cpu_function_runtime::BufferInfo* const buffer_infos_; + const int32 num_buffers_; // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]] // for XLA generated code to be able to find it. @@ -383,6 +413,9 @@ class XlaCompiledCpuFunction { const xla::ProgramShapeProto* program_shape_ = nullptr; const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; + // Creates a descriptor table for XLA Runtime. + std::vector MakeXlaRuntimeDescriptorTable(); + // Add `XlaJitCompiledCpuFunction` as a friend so that it can access the // `set_static_data_*` static methods above. friend class XlaJitCompiledCpuFunction; diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index d8a227c6de3..ef7c45f0a4b 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -15,8 +15,10 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include #include #include +#include #include #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" @@ -53,6 +55,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/node_builder.h" @@ -61,7 +64,6 @@ limitations under the License. 
#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/protobuf/error_codes.pb.h" -#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/tpu/tpu_defs.h" #include "tensorflow/core/util/dump_graph.h" @@ -124,6 +126,10 @@ ComputeArgAndRetvalShardings(const Graph& graph) { return std::make_pair(std::move(arg_shardings), std::move(retval_shardings)); } +// Due to the wonkiness with Resource Cleanup, changing how resources are +// cleaned up here need to change how resources are cleaned up in +// graph_compiler_test. +// LINT.IfChange(ExecuteGraph) Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, XlaCompilationDevice* device, FunctionLibraryRuntime* flib, int64_t step_id) { @@ -150,6 +156,7 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, step_container.reset(); return status; } +// LINT.ThenChange(//tensorflow/compiler/tf2xla/graph_compiler_test.cc) // Builds the XLA computation. // - `args` is the list of input arguments @@ -570,7 +577,7 @@ Status XlaCompiler::FindFunctionBody(const NameAttrList& function, } TF_RETURN_WITH_CONTEXT_IF_ERROR( GetFunctionBody(function, flib_runtime_, fbody), - "Local lookup failed with: ", status.error_message()); + "Local lookup failed with: ", status.message()); if (config_proto) { *config_proto = flib_runtime_->config_proto(); } @@ -896,8 +903,16 @@ Status XlaCompiler::CompileFunction( } } else { VLOG(1) << "MLIR bridge off. Using the old bridge to compile the function"; - TF_RETURN_IF_ERROR( - CompileGraph(options, function_id, std::move(graph), args, result)); + auto status = + CompileGraph(options, function_id, std::move(graph), args, result); + if (!status.ok()) { + ::tsl::errors::AppendToMessage( + &status, "tf2xla conversion failed while converting ", function_id, + ". Run with TF_DUMP_GRAPH_PREFIX=/path/to/dump/dir and " + "--vmodule=xla_compiler=2 to obtain a dump of the compiled " + "functions."); + return status; + } } VLOG(1) << "===================================================="; @@ -1325,7 +1340,7 @@ Status ValidateGraph(const Graph* graph, std::string errmsg = absl::StrCat( "Detected unsupported operations when trying to compile graph ", name, " on ", device_type.type_string(), ": ", node->def().op(), " (", - s.error_message(), ")", FormatNodeForError(*node)); + s.message(), ")", FormatNodeForError(*node)); if (absl::StrContains(device_type.type_string(), "TPU")) { absl::StrAppend(&errmsg, "\nOne approach is to outside compile the unsupported " @@ -1382,12 +1397,43 @@ void ConvertConstantsToExpressions(xla::XlaBuilder* builder, } // namespace +// A temporary dummy stack trace, used to identify locations where stack trace +// info is being lost, and to clarify how stack trace info is otherwise being +// handled in individual passes. This class and its usage below will be removed +// once we have robust end-to-end metadata handling. 
+// TODO(b/265059672): Remove when end-to-end stack trace handling is in place +class DummyStackTrace : public AbstractStackTrace { + absl::Span ToFrames() const override { return frames_; } + + StackFrame LastUserFrame() const override { return frames_.back(); } + + std::vector GetUserFrames(int /*limit*/) const override { + return frames_; + } + + std::string ToString(const TracePrintingOptions& opts) const override { + auto frame = LastUserFrame(); + return absl::StrCat(frame.file_name, ":", frame.line_number, ":", + frame.function_name); + } + + std::vector frames_{ + StackFrame({"dummy_file_name", 10, "dummy_function_name"})}; +}; + Status XlaCompiler::CompileGraph( const XlaCompiler::CompileOptions& options, string const& name, std::unique_ptr graph, absl::Span args, CompilationResult* result) { VLOG(1) << "Executing graph symbolically to populate XlaBuilder.: " << name; + DummyStackTrace stack_trace; + for (auto node : graph->nodes()) { + if (node->GetStackTrace() == nullptr) { + node->SetStackTrace(std::make_shared(stack_trace)); + } + } + TF_RETURN_IF_ERROR(PropagateConstIntoFunctionalNodes( graph.get(), options_.flib_def, local_flib_def_.get())); TF_RETURN_IF_ERROR(RearrangeFunctionArguments( @@ -1635,7 +1681,7 @@ Status XlaCompiler::GetHostComputeControlDependency( } Status XlaCompiler::SetHostComputeControlDependency( - const string& host_compute_name, const xla::XlaOp& handle) { + const string& host_compute_name, const xla::XlaOp handle) { if (host_compute_control_output_.find(host_compute_name) != host_compute_control_output_.end()) { return errors::InvalidArgument( @@ -1660,8 +1706,7 @@ Status XlaCompiler::PopNodeTokenMapping() { return OkStatus(); } -Status XlaCompiler::SetNodeToken(const string& node_name, - const xla::XlaOp& op) { +Status XlaCompiler::SetNodeToken(const string& node_name, const xla::XlaOp op) { if (node_token_mapping_stack_.empty()) { return errors::FailedPrecondition( "Calling SetNodeToken() when node_token_mapping_stack_ is " diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index d027326239e..a90d705b2b1 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -301,7 +301,7 @@ class XlaCompiler { Status GetHostComputeControlDependency(const string& host_compute_name, xla::XlaOp* handle); Status SetHostComputeControlDependency(const string& host_compute_name, - const xla::XlaOp& handle); + xla::XlaOp handle); const Options& options() const { return options_; } xla::Client* client() const { return options_.client; } @@ -309,7 +309,7 @@ class XlaCompiler { void PushNodeTokenMapping(); Status PopNodeTokenMapping(); - Status SetNodeToken(const string& node_name, const xla::XlaOp& op); + Status SetNodeToken(const string& node_name, xla::XlaOp op); StatusOr GetNodeToken(const string& node_name); // Sets the function body `fbody` to the one registered as `function`. 
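The CompileFunction change above illustrates a pattern used throughout this patch: when compilation fails, append actionable context (which function was being converted, how to obtain a graph dump) to the Status before propagating it. A rough sketch of that pattern against a generic absl::Status; the AddDebugHint helper is hypothetical, and rebuilding the Status this way drops any payloads:

#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"

// Hypothetical helper: pass an OK status through unchanged, otherwise return a
// status with the same code and the extra hint appended to the message.
absl::Status AddDebugHint(const absl::Status& s, absl::string_view hint) {
  if (s.ok()) return s;
  return absl::Status(s.code(), absl::StrCat(s.message(), "; ", hint));
}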
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 5231d9e4246..ea34895b76b 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -572,14 +572,13 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) { compiler.CompileGraph(XlaCompiler::CompileOptions(), "reshape", std::move(graph), args, &result); EXPECT_FALSE(status.ok()); + EXPECT_TRUE(absl::StrContains(status.message(), "depends on a parameter")) + << status.message(); + EXPECT_TRUE(absl::StrContains(status.message(), "{{node C}}")) + << status.message(); EXPECT_TRUE( - absl::StrContains(status.error_message(), "depends on a parameter")) - << status.error_message(); - EXPECT_TRUE(absl::StrContains(status.error_message(), "{{node C}}")) - << status.error_message(); - EXPECT_TRUE(absl::StrContains(status.error_message(), - "must be a compile-time constant")) - << status.error_message(); + absl::StrContains(status.message(), "must be a compile-time constant")) + << status.message(); } // Tests handling of compile-time constant outputs. @@ -943,8 +942,8 @@ TEST_F(XlaCompilerTest, UndefinedFunctionFails) { compiler.CompileFunction(XlaCompiler::CompileOptions(), name_attr, /*args=*/{}, &result); EXPECT_FALSE(status.ok()); - EXPECT_TRUE(absl::StrContains(status.error_message(), "is not defined.")) - << status.error_message(); + EXPECT_TRUE(absl::StrContains(status.message(), "is not defined.")) + << status.message(); } FunctionDef FillFn() { @@ -1022,11 +1021,11 @@ TEST_F(XlaCompilerTest, LocalFunctionWithWrongArgumentsFail) { ASSERT_FALSE(status.ok()); // Flib lookup failure. - EXPECT_TRUE(absl::StrContains(status.error_message(), "is not defined.")) - << status.error_message(); + EXPECT_TRUE(absl::StrContains(status.message(), "is not defined.")) + << status.message(); // Local flib lookup failure. - EXPECT_TRUE(absl::StrContains(status.error_message(), "Attr T is not found")) - << status.error_message(); + EXPECT_TRUE(absl::StrContains(status.message(), "Attr T is not found")) + << status.message(); } FunctionDef SliceFn() { @@ -1521,10 +1520,10 @@ TEST_F(XlaCompilerTest, FunctionWithInvalidOp) { status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "fill", std::move(graph), args, &result); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(absl::StrContains(status.error_message(), "InvalidOp")) - << status.error_message(); - EXPECT_TRUE(absl::StrContains(status.error_message(), "{{node fill_fn}}")) - << status.error_message(); + EXPECT_TRUE(absl::StrContains(status.message(), "InvalidOp")) + << status.message(); + EXPECT_TRUE(absl::StrContains(status.message(), "{{node fill_fn}}")) + << status.message(); } // Tests a graph which has a node with invalid data type. 
@@ -1546,11 +1545,11 @@ TEST_F(XlaCompilerTest, NodeWithInvalidDataType) { status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "invalid_type", std::move(graph), args, &result); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(absl::StrContains(status.error_message(), + EXPECT_TRUE(absl::StrContains(status.message(), "is not in the list of allowed values")) - << status.error_message(); - EXPECT_TRUE(absl::StrContains(status.error_message(), "{{node Shape}}")) - << status.error_message(); + << status.message(); + EXPECT_TRUE(absl::StrContains(status.message(), "{{node Shape}}")) + << status.message(); } TEST_F(XlaCompilerTest, SingleOpWithoutInputs) { diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc index f52e83c8c63..c936d6d7962 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.cc +++ b/tensorflow/compiler/tf2xla/xla_helpers.cc @@ -90,8 +90,8 @@ xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type, Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64_t depth, int axis, DataType index_type, const TensorShape& indices_shape, - const xla::XlaOp& indices, const xla::XlaOp& on_value, - const xla::XlaOp& off_value, xla::XlaOp* one_hot) { + const xla::XlaOp indices, const xla::XlaOp on_value, + const xla::XlaOp off_value, xla::XlaOp* one_hot) { // Broadcast the linspace constant across the indices along the new axis, // and test equality at each position. std::vector broadcast_dims(indices_shape.dims()); @@ -128,7 +128,7 @@ DataType XlaHelpers::SumAccumulationType(const DataType& dtype) { return dtype; } -xla::XlaOp XlaHelpers::ConvertElementType(const xla::XlaOp& operand, +xla::XlaOp XlaHelpers::ConvertElementType(const xla::XlaOp operand, const DataType new_element_type) { xla::PrimitiveType convert_to; TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to)); diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h index 0e621995cbc..f2551774f1c 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.h +++ b/tensorflow/compiler/tf2xla/xla_helpers.h @@ -68,8 +68,8 @@ class XlaHelpers { // respectively. static Status OneHot(xla::XlaBuilder* builder, int64_t depth, int axis, DataType index_type, const TensorShape& indices_shape, - const xla::XlaOp& indices, const xla::XlaOp& on_value, - const xla::XlaOp& off_value, xla::XlaOp* one_hot); + xla::XlaOp indices, xla::XlaOp on_value, + xla::XlaOp off_value, xla::XlaOp* one_hot); // Certain DataTypes should use increased precision DataTypes when performing // reductions. This function remaps a given DataType to a higher precision @@ -78,7 +78,7 @@ class XlaHelpers { // A helper for creating a ConvertElementType xla op given a DataType rather // than the xla::PrimitiveType. 
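A note on the signature changes in xla_helpers and xla_compiler above: xla::XlaOp is a small handle (roughly an op id plus a non-owning builder pointer), so the patch passes it by value instead of by const reference. A toy illustration of why this is a wash performance-wise, using a made-up handle type:

#include <cstdint>

// Made-up handle of about the same size as xla::XlaOp: copying it moves two
// machine words, so pass-by-value is as cheap as pass-by-reference and avoids
// an extra indirection as well as dangling-reference questions.
class OpHandle {
 public:
  OpHandle(int64_t id, const void* builder) : id_(id), builder_(builder) {}
  int64_t id() const { return id_; }

 private:
  int64_t id_ = -1;
  const void* builder_ = nullptr;
};

// Taking the handle by value, matching the new signatures above.
int64_t IdOf(OpHandle op) { return op.id(); }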
- static xla::XlaOp ConvertElementType(const xla::XlaOp& operand, + static xla::XlaOp ConvertElementType(xla::XlaOp operand, const DataType new_element_type); typedef std::function(const TensorShape&, DataType, bool, diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index c9d17abe2a7..5c54551707b 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -79,6 +79,15 @@ void CollectNames(const T& entries, std::vector* nonempty_names, name_ptrs->push_back(nullptr); // array terminator } +bool RunXlaRuntime(const xla::cpu::CpuExecutable* cpu_executable, + const std::vector& descriptor_table, + const xla::ExecutableRunOptions* run_options) { + assert(cpu_executable->IsXlaRuntime()); + Status status = + cpu_executable->ExecuteXlaRuntime(descriptor_table, run_options); + return status.ok(); +} + } // namespace /*static*/ StatusOr> @@ -147,6 +156,12 @@ XlaJitCompiledCpuFunction::Compile( std::make_unique(program_shape->ToProto()); XlaCompiledCpuFunction::set_static_data_raw_function(&jit->static_data_, raw_function); + if (cpu_executable->IsXlaRuntime()) { + XlaCompiledCpuFunction::set_static_data_run_function(&jit->static_data_, + RunXlaRuntime); + XlaCompiledCpuFunction::set_static_data_cpu_executable(&jit->static_data_, + cpu_executable); + } XlaCompiledCpuFunction::set_static_data_buffer_infos( &jit->static_data_, jit->buffer_infos_.data()); XlaCompiledCpuFunction::set_static_data_num_buffers( diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc index 6f45dcf1726..f1b838ab882 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc @@ -331,7 +331,7 @@ TEST(XlaJitCompiledCpuFunction, CanCompileWithAdditionalPlatform) { return std::unique_ptr(nullptr); }); - EXPECT_THAT(xla::PlatformUtil::GetDefaultPlatform().status().error_message(), + EXPECT_THAT(xla::PlatformUtil::GetDefaultPlatform().status().message(), HasSubstr("FakePlatform")); GraphDef graph_def = SumGraph(); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index 0f7373659bd..54996de9c24 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -199,18 +199,20 @@ Status XlaOpKernelContext::ConstantInputReshaped( return OkStatus(); } -// Converts an int32 or int64 scalar literal to an int64. +// Converts an int16, int32 or int64 scalar literal to an int64. 
static Status LiteralToInt64Scalar(const xla::LiteralSlice& literal, int64_t* out) { if (literal.shape().rank() != 0) { return errors::InvalidArgument("value is not a scalar"); } - if (literal.shape().element_type() == xla::S32) { + if (literal.shape().element_type() == xla::S16) { + *out = literal.Get({}); + } else if (literal.shape().element_type() == xla::S32) { *out = literal.Get({}); } else if (literal.shape().element_type() == xla::S64) { *out = literal.Get({}); } else { - return errors::InvalidArgument("value must be either int32 or int64"); + return errors::InvalidArgument("value must be int16, int32, or int64"); } return OkStatus(); } @@ -754,8 +756,7 @@ Status XlaOpKernelContext::AssignVariable(absl::string_view name, DataType type, static Status GetStatusWithStackTrace(const Status& s, const XlaOpKernelContext* ctx) { if (s.code() == error::INVALID_ARGUMENT) { - return Status{s.code(), - absl::StrCat(s.error_message(), "\n", ctx->StackTrace())}; + return Status{s.code(), absl::StrCat(s.message(), "\n", ctx->StackTrace())}; } return s; } diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index 7f1b5dbd1b9..0bdb03cd76b 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -222,7 +222,7 @@ void XlaOpRegistry::RegisterCompilationKernels() { const OpDef* op_def; Status lookup_status = op_registry->LookUpOpDef(op_name, &op_def); if (!lookup_status.ok()) { - LOG(ERROR) << lookup_status.error_message(); + LOG(ERROR) << lookup_status.message(); XLA_LOG_LINES( ERROR, "Ops registered: \n" + diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 347c81c10d2..f29391811ed 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -29,6 +29,7 @@ package_group( "//third_party/mlir_edge/model_curriculum/...", "//third_party/py/jax/...", "//third_party/py/t5x/...", + "//third_party/py/tpu_graphs/...", "//tensorflow/compiler/...", "//tensorflow/python/tpu/...", ], @@ -206,6 +207,7 @@ cc_library( visibility = [":friends"], deps = [ "//third_party/eigen3", + "@com_google_absl//absl/strings:str_format", ], ) @@ -770,6 +772,10 @@ cc_library( hdrs = ["executable_run_options.h"], compatible_with = get_compatible_with_portable(), visibility = ["//visibility:public"], + deps = [ + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status:statusor", + ], ) cc_library( @@ -1177,7 +1183,9 @@ filegroup( "runlit.cfg.py", "runlit.site.cfg.py", ], - visibility = ["//tensorflow/compiler/xla:__subpackages__"], + visibility = [ + "//tensorflow/compiler/xla:__subpackages__", # Scheuklappen: keep + ], ) # ----------------------------------------------------------------------------- diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h index bdfc8d687e6..3238ffdf53d 100644 --- a/tensorflow/compiler/xla/array.h +++ b/tensorflow/compiler/xla/array.h @@ -18,7 +18,8 @@ limitations under the License. #include #include -#include +#include +#include #include #include #include @@ -26,15 +27,12 @@ limitations under the License. 
#include #include #include -#include #include "absl/functional/function_ref.h" #include "absl/strings/str_cat.h" -#include "absl/strings/str_join.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/tsl/platform/logging.h" namespace xla { @@ -69,29 +67,37 @@ class Array { // nests, especially if one or more dimensions is one as the compiler just // sees a single-element integer initializer. These typedefs allow casting // explicitly with less typing. - using InitializerList1D = std::initializer_list; - using InitializerList2D = std::initializer_list; - using InitializerList3D = std::initializer_list; - using InitializerList4D = std::initializer_list; + template + using InitializerList1D = std::initializer_list; + template + using InitializerList2D = std::initializer_list>; + template + using InitializerList3D = std::initializer_list>; + template + using InitializerList4D = std::initializer_list>; using value_type = T; // Creates a new array with the specified dimensions and initialized elements. explicit Array(absl::Span sizes) - : sizes_(sizes.begin(), sizes.end()), values_(new T[num_elements()]()) {} + : sizes_(sizes.size()), + values_(calculate_elements(sizes), default_init_t{}) { + std::memcpy(sizes_.data.get(), sizes.data(), + sizeof(int64_t) * sizes.size()); + } // Creates a new array with the specified dimensions and specified value for // every cell. Array(absl::Span sizes, T value) - : sizes_(sizes.begin(), sizes.end()), values_(new T[num_elements()]) { + : Array(sizes, no_default_init_t{}) { Fill(value); } // Creates a 2D array from the given nested initializer list. The outer // initializer list is the first dimension, the inner is the second dimension. // For example, {{1, 2, 3}, {4, 5, 6}} results in an array with n1=2 and n2=3. - Array(InitializerList2D values) - : Array(ToInt64Vector({values.size(), values.begin()->size()})) { + Array(InitializerList2D values) + : Array(ToInt64Array(values), no_default_init_t{}) { int64_t idx = 0; for (const auto& it1 : values) { for (const auto& it2 : it1) { @@ -111,7 +117,7 @@ class Array { std::is_same::value) && std::is_same::value>::type> Array(std::initializer_list values) - : Array(ToInt64Vector({values.size()})) { + : Array(ToInt64Array(values), no_default_init_t{}) { int64_t idx = 0; for (const auto& it1 : values) { values_[idx] = static_cast(it1); @@ -131,7 +137,7 @@ class Array { std::is_same::value) && std::is_same::value>::type> Array(std::initializer_list> values) - : Array(ToInt64Vector({values.size(), values.begin()->size()})) { + : Array(ToInt64Array(values), no_default_init_t{}) { int64_t idx = 0; for (const auto& it1 : values) { for (const auto& it2 : it1) { @@ -144,9 +150,8 @@ class Array { // Creates a 3D array from the given nested initializer list. The outer // initializer list is the first dimension, and so on. 
- Array(InitializerList3D values) - : Array(ToInt64Vector({values.size(), values.begin()->size(), - values.begin()->begin()->size()})) { + Array(InitializerList3D values) + : Array(ToInt64Array(values), no_default_init_t{}) { int64_t idx = 0; for (const auto& it1 : values) { for (const auto& it2 : it1) { @@ -169,8 +174,7 @@ class Array { std::is_same::value>::type> Array(std::initializer_list>> values) - : Array(ToInt64Vector({values.size(), values.begin()->size(), - values.begin()->begin()->size()})) { + : Array(ToInt64Array(values), no_default_init_t{}) { int64_t idx = 0; for (const auto& it1 : values) { for (const auto& it2 : it1) { @@ -185,10 +189,8 @@ class Array { // Creates a 4D array from the given nested initializer list. The outer // initializer list is the first dimension, and so on. - Array(InitializerList4D values) - : Array(ToInt64Vector({values.size(), values.begin()->size(), - values.begin()->begin()->size(), - values.begin()->begin()->begin()->size()})) { + Array(InitializerList4D values) + : Array(ToInt64Array(values), no_default_init_t{}) { int64_t idx = 0; for (const auto& it1 : values) { for (const auto& it2 : it1) { @@ -214,9 +216,7 @@ class Array { Array(std::initializer_list< std::initializer_list>>> values) - : Array(ToInt64Vector({values.size(), values.begin()->size(), - values.begin()->begin()->size(), - values.begin()->begin()->begin()->size()})) { + : Array(ToInt64Array(values), no_default_init_t{}) { int64_t idx = 0; for (const auto& it1 : values) { for (const auto& it2 : it1) { @@ -232,43 +232,29 @@ class Array { } Array(const Array& other) - : sizes_(other.sizes_), values_(new T[num_elements()]) { - std::copy(&other.values_[0], &other.values_[0] + num_elements(), - &values_[0]); - } + : sizes_(other.sizes_.Clone()), values_(other.values_.Clone()) {} - Array(Array&& other) - : sizes_(std::move(other.sizes_)), values_(std::move(other.values_)) {} + Array(Array&& other) = default; Array& operator=(const Array& other) { - sizes_ = other.sizes_; - values_.reset(new T[num_elements()]); - std::copy(&other.values_[0], &other.values_[0] + num_elements(), - &values_[0]); + sizes_ = other.sizes_.Clone(); + values_ = other.values_.Clone(); return *this; } - Array& operator=(Array&& other) { - sizes_ = std::move(other.sizes_); - values_ = std::move(other.values_); - return *this; - } + Array& operator=(Array&& other) = default; // Fills the array with the specified value. - void Fill(const T& value) { - std::fill(&values_[0], &values_[0] + num_elements(), value); - } + void Fill(const T& value) { std::fill(begin(), end(), value); } // Fills the array with sequentially increasing values. - void FillIota(const T& value) { - std::iota(&values_[0], &values_[0] + num_elements(), value); - } + void FillIota(const T& value) { std::iota(begin(), end(), value); } // Fills the array with a repeating sequence: // [value, value + 1, ..., value + length - 1, value, ... 
] void FillRepeatedIota(const T& value, int64_t length) { for (int64_t i = 0; i < num_elements(); i += length) { - std::iota(&values_[i], &values_[std::min(i + length, num_elements())], + std::iota(begin() + i, begin() + std::min(i + length, num_elements()), value); } } @@ -324,23 +310,23 @@ class Array { void SetValues(const Container& container) { CHECK_EQ(std::distance(std::begin(container), std::end(container)), num_elements()); - std::copy(std::begin(container), std::end(container), &values_[0]); + std::copy(std::begin(container), std::end(container), begin()); } // Invokes a callback with the (indices, value_ptr) for each cell in the // array. void Each(absl::FunctionRef, T*)> f) { - std::vector index(sizes_.size()); + OwnedBuffer index(sizes_.size, default_init_t{}); for (int64_t i = 0; i < num_elements(); ++i, next_index(&index)) { - f(index, &values_[i]); + f(index.span(), &values_[i]); } } // Invokes a callback with the (indices, value) for each cell in the array. void Each(absl::FunctionRef, T)> f) const { - std::vector index(sizes_.size()); + OwnedBuffer index(sizes_.size, default_init_t{}); for (int64_t i = 0; i < num_elements(); ++i, next_index(&index)) { - f(index, values_[i]); + f(index.span(), values_[i]); } } @@ -349,9 +335,9 @@ class Array { // OkStatus(). Status EachStatus( absl::FunctionRef, T*)> f) { - std::vector index(sizes_.size()); + OwnedBuffer index(sizes_.size, default_init_t{}); for (int64_t i = 0; i < num_elements(); ++i, next_index(&index)) { - Status s = f(index, &values_[i]); + Status s = f(index.span(), &values_[i]); if (!s.ok()) { return s; } @@ -364,9 +350,9 @@ class Array { // OkStatus(). Status EachStatus( absl::FunctionRef, T)> f) const { - std::vector index(sizes_.size()); + OwnedBuffer index(sizes_.size, default_init_t{}); for (int64_t i = 0; i < num_elements(); ++i, next_index(&index)) { - Status s = f(index, values_[i]); + Status s = f(index.span(), values_[i]); if (!s.ok()) { return s; } @@ -384,6 +370,7 @@ class Array { typename std::enable_if::value, const T&>::type operator()(Dims... dims) const { + CHECK_EQ(sizeof...(dims), num_dimensions()); // We are using a std::array to avoid having to allocate memory in this // function for performance reasons. std::array indexes{ @@ -397,23 +384,21 @@ class Array { typename std::enable_if::value, T&>::type operator()(Dims... dims) { - // We are using a std::array to avoid having to allocate memory in this - // function for performance reasons. - std::array indexes{ - {static_cast(dims)...}}; - return values_[calculate_index(indexes)]; + return const_cast(const_cast(this)->operator()( + std::forward(dims)...)); } // Returns the value at the cell specified by the indexes. The number of // arguments have to match with the number of dimensions for the array. const T& operator()(absl::Span indexes) const { + CHECK_EQ(indexes.size(), num_dimensions()); return values_[calculate_index(indexes)]; } // Returns the value at the cell specified by the indexes. The number of // arguments have to match with the number of dimensions for the array. T& operator()(absl::Span indexes) { - return values_[calculate_index(indexes)]; + return const_cast(const_cast(this)->operator()(indexes)); } // Low-level accessor for stuff like memcmp, handle with care. Returns pointer @@ -422,37 +407,33 @@ class Array { // TODO(tberghammer): Get rid of the const_cast. Currently it is needed // because the Eigen backend needs a non-const pointers even for reading // from the array. 
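As a usage-level recap of the Array surface touched above (the nested initializer-list constructors, Each, and the element accessors), a small self-contained example; only the public xla::Array API is assumed:

#include <cstdint>
#include <iostream>

#include "absl/types/span.h"
#include "tensorflow/compiler/xla/array.h"

int main() {
  // 2x3 array built from a nested initializer list.
  xla::Array<int> a({{1, 2, 3}, {4, 5, 6}});
  std::cout << a.num_dimensions() << " dims, " << a.num_elements()
            << " elements\n";

  // Visit every cell; `indices` holds one index per dimension.
  a.Each([](absl::Span<const int64_t> indices, int* value) {
    *value += static_cast<int>(indices[0]);  // add the row index to each cell
  });
  std::cout << a.ToString() << "\n";
  return 0;
}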
- return const_cast(this)->values_.get(); + return const_cast(this)->values_.data.get(); } // Returns the size of the dimension at the given index. int64_t dim(int64_t n) const { - const int64_t sizes_size = sizes_.size(); - CHECK(n < sizes_size); + DCHECK_LT(n, sizes_.size); return sizes_[n]; } // Returns a vector containing the dimensions of the array. - const std::vector& dimensions() const { return sizes_; } + absl::Span dimensions() const { return sizes_.span(); } - int64_t num_dimensions() const { return sizes_.size(); } + int64_t num_dimensions() const { return sizes_.size; } // Returns the total number of elements in the array. - int64_t num_elements() const { - return std::accumulate(sizes_.begin(), sizes_.end(), 1LL, - std::multiplies()); - } + int64_t num_elements() const { return values_.size; } - const T* begin() const { return &values_[0]; } - T* begin() { return &values_[0]; } - const T* end() const { return &values_[num_elements()]; } - T* end() { return &values_[num_elements()]; } + const T* begin() const { return values_.data.get(); } + T* begin() { return values_.data.get(); } + const T* end() const { return values_.data.get() + num_elements(); } + T* end() { return values_.data.get() + num_elements(); } bool operator==(const Array& other) const { - if (sizes_.size() != other.sizes_.size()) { + if (sizes_.size != other.sizes_.size) { return false; } - for (int64_t i = 0, end = sizes_.size(); i < end; ++i) { + for (int64_t i = 0, end = sizes_.size; i < end; ++i) { if (sizes_[i] != other.sizes_[i]) { return false; } @@ -473,16 +454,16 @@ class Array { CHECK_EQ(starts.size(), num_dimensions()); CHECK_EQ(limits.size(), num_dimensions()); - std::vector sizes; - std::transform(starts.begin(), starts.end(), limits.begin(), - std::back_inserter(sizes), - [](int64_t start, int64_t limit) { return limit - start; }); - Array result(sizes); + OwnedBuffer sizes(starts.size()); + for (int64_t i = 0; i < starts.size(); ++i) { + sizes[i] = limits[i] - starts[i]; + } + Array result(sizes.span()); - std::vector index(sizes_.size()); + OwnedBuffer index(sizes_.size, default_init_t{}); int64_t slice_i = 0; for (int64_t i = 0; i < num_elements(); ++i, next_index(&index)) { - if (array_impl::all_inside_range(index, starts, limits)) { + if (array_impl::all_inside_range(index.span(), starts, limits)) { // Even though the bounds of result are different to our bounds, we're // iterating in the same order. So we can simply write successive linear // indices instead of recalculating a multi-dimensional index. @@ -496,14 +477,15 @@ class Array { void UpdateSlice(const Array& from, absl::Span start_indices) { CHECK_EQ(from.num_dimensions(), num_dimensions()); - std::vector limit_indices; - std::transform(start_indices.begin(), start_indices.end(), - from.dimensions().begin(), std::back_inserter(limit_indices), - std::plus{}); - std::vector index(sizes_.size()); + OwnedBuffer limit_indices(start_indices.size()); + for (int64_t i = 0; i < start_indices.size(); ++i) { + limit_indices[i] = from.sizes_[i] + start_indices[i]; + } + OwnedBuffer index(sizes_.size, default_init_t{}); int64_t from_i = 0; for (int64_t i = 0; i < num_elements(); ++i, next_index(&index)) { - if (array_impl::all_inside_range(index, start_indices, limit_indices)) { + if (array_impl::all_inside_range(index.span(), start_indices, + limit_indices)) { // Even though the bounds of from are different to our bounds, we're // iterating in the same order. 
So we can simply write successive linear // indices instead of recalculating a multi-dimensional index. @@ -515,86 +497,167 @@ class Array { // Performs an in-place reshape, modifying the dimensions but not the // underlying data. void Reshape(absl::Span new_dimensions) { - int64_t old_num_elements = num_elements(); - sizes_ = std::vector(new_dimensions.begin(), new_dimensions.end()); - CHECK_EQ(num_elements(), old_num_elements); + const int64_t new_num_elements = + std::accumulate(new_dimensions.begin(), new_dimensions.end(), 1LL, + std::multiplies()); + CHECK_EQ(new_num_elements, num_elements()); + if (sizes_.size != new_dimensions.size()) { + sizes_ = OwnedBuffer(new_dimensions.size()); + } + std::memcpy(sizes_.data.get(), new_dimensions.data(), + new_dimensions.size() * sizeof(int64_t)); } // Performs a permutation of dimensions. void TransposeDimensions(absl::Span permutation) { - std::vector permuted_dims(permutation.size()); + CHECK_EQ(sizes_.size, permutation.size()); + OwnedBuffer permuted_dims(permutation.size()); for (int64_t i = 0; i < permutation.size(); ++i) { permuted_dims[i] = this->dim(permutation[i]); } - Array permuted(permuted_dims); - std::vector src_indices(sizes_.size(), -1); + Array permuted(permuted_dims.span()); + OwnedBuffer src_indices(sizes_.size, -1); permuted.Each([&](absl::Span indices, T* value) { - CHECK_EQ(sizes_.size(), indices.size()); - for (int64_t i = 0; i < sizes_.size(); ++i) { + for (int64_t i = 0; i < sizes_.size; ++i) { src_indices[permutation[i]] = indices[i]; } - *value = (*this)(src_indices); + *value = (*this)(src_indices.span()); }); *this = std::move(permuted); } template friend H AbslHashValue(H h, const Array& array) { - return H::combine(std::move(h), absl::MakeSpan(array.begin(), array.end()), - array.dimensions()); + return H::combine(std::move(h), array.values_.span(), array.dimensions()); } // Returns a string representation of the array suitable for debugging. std::string ToString() const { - if (sizes_.empty()) { + if (sizes_.size == 0) { return ""; } - std::vector pieces; - std::vector index(sizes_.size()); + std::string result; + OwnedBuffer index(sizes_.size, default_init_t{}); do { // Emit leading spaces and opening square brackets - if (index.back() == 0) { - for (int64_t i = sizes_.size() - 1; i >= 0; --i) { + if (index[index.size - 1] == 0) { + for (int64_t i = sizes_.size - 1; i >= 0; --i) { if (i == 0 || index[i - 1] != 0) { - for (int64_t j = 0; j < sizes_.size(); ++j) { - pieces.push_back(j < i ? " " : "["); + for (int64_t j = 0; j < sizes_.size; ++j) { + absl::StrAppend(&result, j < i ? 
" " : "["); } break; } } } - int value_index = calculate_index(index); + int value_index = calculate_index(index.span()); if (value_index < num_elements()) { - pieces.push_back(absl::StrCat(values_[value_index])); + absl::StrAppend(&result, values_[value_index]); } // Emit comma if it isn't the last element - if (index.back() < sizes_.back() - 1) { - pieces.push_back(", "); + if (index[index.size - 1] < sizes_[sizes_.size - 1] - 1) { + absl::StrAppend(&result, ", "); } // Emit closing square brackets - for (int64_t i = sizes_.size() - 1; i >= 0; --i) { + for (int64_t i = sizes_.size - 1; i >= 0; --i) { if (index[i] < sizes_[i] - 1) { break; } - pieces.push_back("]"); + absl::StrAppend(&result, "]"); if (i != 0 && index[i - 1] < sizes_[i - 1] - 1) { - pieces.push_back(",\n"); + absl::StrAppend(&result, ",\n"); } } } while (next_index(&index)); - return absl::StrJoin(pieces, ""); + return result; } private: - // Converts an initializer_list of type U to a vector of type int64_t. Used by - // the initializer list based constructors to convert the size type into - // int64_t to be passed to the size based constructor. - template - static std::vector ToInt64Vector( - const std::initializer_list& data) { - return std::vector(data.begin(), data.end()); + struct default_init_t {}; + struct no_default_init_t {}; + // A fixed sized dynamically allocated buffer to replace std::vector usage. It + // saves one word for storing capacity which is always the same as size and it + // provides the ability to leave its elements uninitialized if the element + // type is trivially destructible. + template + struct OwnedBuffer { + explicit OwnedBuffer(size_t size) + : data(std::is_trivially_destructible_v ? new D[size] + : new D[size]()), + size(size) {} + explicit OwnedBuffer(size_t size, default_init_t) + : data(new D[size]()), size(size) {} + + explicit OwnedBuffer(size_t size, D init) : OwnedBuffer(size) { + std::fill(data.get(), data.get() + size, init); + } + + OwnedBuffer(OwnedBuffer&& other) + : data(std::move(other.data)), size(other.size) { + other.size = 0; + } + + OwnedBuffer& operator=(OwnedBuffer&& other) { + data = std::move(other.data); + size = other.size; + other.size = 0; + return *this; + } + + OwnedBuffer Clone() const { + OwnedBuffer clone(size); + std::memcpy(clone.data.get(), data.get(), size * sizeof(D)); + return clone; + } + + D& operator[](int64_t index) { return data[index]; } + const D& operator[](int64_t index) const { return data[index]; } + + absl::Span span() const { + return absl::MakeConstSpan(data.get(), size); + } + + std::unique_ptr data; + size_t size; + }; + + explicit Array(absl::Span sizes, no_default_init_t) + : sizes_(sizes.size()), values_(calculate_elements(sizes)) { + std::memcpy(sizes_.data.get(), sizes.data(), + sizeof(int64_t) * sizes.size()); + } + + // Extracts the dimensions of an initializer_list to an array type int64_t. + // Used by the initializer list based constructors to convert the size type + // into int64_t to be passed to the size based constructor. 
+ template + static std::array ToInt64Array(const InitializerList1D& data) { + return std::array{static_cast(data.size())}; + } + + template + static std::array ToInt64Array(const InitializerList2D& data) { + return std::array{static_cast(data.size()), + static_cast(data.begin()->size())}; + } + + template + static std::array ToInt64Array(const InitializerList3D& data) { + return std::array{ + static_cast(data.size()), + static_cast(data.begin()->size()), + static_cast(data.begin()->begin()->size())}; + } + + template + static std::array ToInt64Array(const InitializerList4D& data) { + return std::array{ + static_cast(data.size()), + static_cast(data.begin()->size()), + static_cast(data.begin()->begin()->size()), + static_cast(data.begin()->begin()->begin()->size())}; } // Returns the linear index from the list of per-dimension indexes. Function @@ -602,11 +665,10 @@ class Array { // memory allocation. // The returned value may be larger than or equal to the number of elements if // the indexes exceed the array's corresponding dimension size. - template - int64_t calculate_index(const U& indexes) const { - CHECK_EQ(sizes_.size(), indexes.size()); + int64_t calculate_index(absl::Span indexes) const { + DCHECK_EQ(sizes_.size, indexes.size()); int64_t index = 0; - for (int64_t i = 0; i < sizes_.size(); ++i) { + for (int64_t i = 0; i < sizes_.size; ++i) { index *= sizes_[i]; index += indexes[i]; } @@ -615,9 +677,9 @@ class Array { // Advances the specified set of indexes and returns true if we haven't // wrapped around (i.e. result isn't {0, 0, ...}). - bool next_index(std::vector* index) const { - CHECK_EQ(index->size(), sizes_.size()); - for (int64_t i = sizes_.size() - 1; i >= 0; --i) { + bool next_index(OwnedBuffer* index) const { + DCHECK_EQ(index->size, sizes_.size); + for (int64_t i = sizes_.size - 1; i >= 0; --i) { (*index)[i]++; if ((*index)[i] < sizes_[i]) { return true; @@ -627,15 +689,20 @@ class Array { return false; } - std::vector sizes_; - std::unique_ptr values_; + static size_t calculate_elements(absl::Span sizes) { + return std::accumulate(sizes.begin(), sizes.end(), 1LL, + std::multiplies()); + } + + OwnedBuffer sizes_; + OwnedBuffer values_; }; // Specialization of FillRandom() method for complex64 type. Uses real part of // the stddev parameter as the standard deviation value. template <> -void Array::FillRandom(const complex64& stddev, const double mean, - const int seed); +void Array::FillRandom(const complex64& stddev, double mean, + int seed); } // namespace xla diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h index 77e3c9c94e8..2409fe6268b 100644 --- a/tensorflow/compiler/xla/array2d.h +++ b/tensorflow/compiler/xla/array2d.h @@ -54,6 +54,7 @@ class Array2D : public Array { // or double) from the given nested initializer list of float values. template ::value || + std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || diff --git a/tensorflow/compiler/xla/autotune_serialize.cc b/tensorflow/compiler/xla/autotune_serialize.cc index 149fdf9f24f..71e0562b6e8 100644 --- a/tensorflow/compiler/xla/autotune_serialize.cc +++ b/tensorflow/compiler/xla/autotune_serialize.cc @@ -40,7 +40,7 @@ Status LoadAutotuneResults(absl::string_view data) { } if (results.version() != kVersion) { return tsl::errors::InvalidArgument(absl::StrFormat( - "Version mismatch in autotune results. Expected %d but was %d", + "Version mismatch in autotune results. 
Expected %d but was %d", kVersion, results.version())); } diff --git a/tensorflow/compiler/xla/backends/interpreter/platform.cc b/tensorflow/compiler/xla/backends/interpreter/platform.cc index 0259baf8221..9e7bfe804f8 100644 --- a/tensorflow/compiler/xla/backends/interpreter/platform.cc +++ b/tensorflow/compiler/xla/backends/interpreter/platform.cc @@ -80,7 +80,7 @@ XlaInterpreterPlatform::GetUncachedExecutor( auto init_status = executor->Init(config.device_options); if (!init_status.ok()) { return tsl::Status{ - tsl::error::INTERNAL, + absl::StatusCode::kInternal, absl::StrFormat( "failed initializing StreamExecutor for device ordinal %d: %s", config.ordinal, init_status.ToString())}; diff --git a/tensorflow/compiler/xla/backends/profiler/cpu/BUILD b/tensorflow/compiler/xla/backends/profiler/cpu/BUILD index 3543ed7a558..c1f28b7bc4d 100644 --- a/tensorflow/compiler/xla/backends/profiler/cpu/BUILD +++ b/tensorflow/compiler/xla/backends/profiler/cpu/BUILD @@ -9,7 +9,6 @@ cc_library( visibility = [ "//tensorflow/compiler/xla/backends/profiler:__pkg__", "//tensorflow/core/profiler:internal", - "//third_party/car/onboard/gpu:__subpackages__", ], deps = [ ":host_tracer_impl", diff --git a/tensorflow/compiler/xla/backends/profiler/gpu/BUILD b/tensorflow/compiler/xla/backends/profiler/gpu/BUILD index 12a4f1a9926..5ae87af9e0a 100644 --- a/tensorflow/compiler/xla/backends/profiler/gpu/BUILD +++ b/tensorflow/compiler/xla/backends/profiler/gpu/BUILD @@ -112,8 +112,8 @@ tsl_gpu_cc_test( size = "small", srcs = ["cupti_error_manager_test.cc"], tags = tf_cuda_tests_tags() + [ - "nomac", "gpu_cupti", + "nomac", ], deps = [ "//tensorflow/tsl/platform:test_main", @@ -240,23 +240,23 @@ tsl_gpu_library( copts = tf_profiler_copts() + tsl_copts(), visibility = ["//visibility:public"], deps = [ + "//tensorflow/tsl/platform:abi", + "//tensorflow/tsl/platform:macros", + "//tensorflow/tsl/platform:mutex", + "//tensorflow/tsl/platform:platform_port", + "//tensorflow/tsl/platform:status", + "//tensorflow/tsl/platform:types", + "//tensorflow/tsl/profiler/protobuf:xplane_proto_cc", + "//tensorflow/tsl/profiler/utils:parse_annotation", + "//tensorflow/tsl/profiler/utils:trace_utils", + "//tensorflow/tsl/profiler/utils:xplane_builder", + "//tensorflow/tsl/profiler/utils:xplane_schema", + "//tensorflow/tsl/profiler/utils:xplane_utils", "@com_google_absl//absl/container:fixed_array", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:node_hash_set", "@com_google_absl//absl/strings", - "//tensorflow/tsl/platform:abi", - "//tensorflow/tsl/platform:platform_port", - "//tensorflow/tsl/platform:mutex", - "//tensorflow/tsl/platform:macros", - "//tensorflow/tsl/platform:status", - "//tensorflow/tsl/platform:types", - "//tensorflow/tsl/profiler/protobuf:xplane_proto_cc", - "//tensorflow/tsl/profiler/utils:parse_annotation", - "//tensorflow/tsl/profiler/utils:xplane_builder", - "//tensorflow/tsl/profiler/utils:xplane_schema", - "//tensorflow/tsl/profiler/utils:xplane_utils", - "//tensorflow/tsl/profiler/utils:trace_utils", ] + tf_additional_cupti_deps(), ) diff --git a/tensorflow/compiler/xla/backends/profiler/gpu/cupti_tracer.cc b/tensorflow/compiler/xla/backends/profiler/gpu/cupti_tracer.cc index 11a13892df9..f3a17b64db2 100644 --- a/tensorflow/compiler/xla/backends/profiler/gpu/cupti_tracer.cc +++ b/tensorflow/compiler/xla/backends/profiler/gpu/cupti_tracer.cc @@ -90,7 +90,7 @@ Status ToStatus(CUresult result) { inline void LogIfError(const Status 
&status) { if (status.ok()) return; - LOG(ERROR) << status.error_message(); + LOG(ERROR) << status.message(); } // Maps an OverheadKind enum to a const string. diff --git a/tensorflow/compiler/xla/backends/profiler/tpu/tpu_tracer.cc b/tensorflow/compiler/xla/backends/profiler/tpu/tpu_tracer.cc index 48f9ec1bead..de05a7e1545 100644 --- a/tensorflow/compiler/xla/backends/profiler/tpu/tpu_tracer.cc +++ b/tensorflow/compiler/xla/backends/profiler/tpu/tpu_tracer.cc @@ -65,7 +65,7 @@ TpuTracer::TpuTracer() { stream_executor::tpu::OpsApiFn()->TpuProfiler_CreateFn(&tpu_profiler_, status.c_status); if (!status.ok()) { - LOG(ERROR) << status.status().error_message(); + LOG(ERROR) << status.status().message(); } } diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index e1a9edfb119..9d46f68b6fb 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -212,6 +212,7 @@ xla_test( "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:test_macros_header", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/tsl/lib/core:status_test_util", ], ) @@ -293,6 +294,7 @@ cc_library( hdrs = ["prng.h"], deps = [ ":constants", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/client:xla_builder", diff --git a/tensorflow/compiler/xla/client/lib/constants_test.cc b/tensorflow/compiler/xla/client/lib/constants_test.cc index 4bc4c494d87..5b034dde320 100644 --- a/tensorflow/compiler/xla/client/lib/constants_test.cc +++ b/tensorflow/compiler/xla/client/lib/constants_test.cc @@ -40,7 +40,7 @@ XLA_TEST_F(ConstantsTest, ConstantR0WithTypeS32DoesNotAcceptFloats) { ConstantR0WithType(&builder, xla::S32, 4.5); auto statusor = builder.Build(); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), HasSubstr("Invalid cast")); + EXPECT_THAT(statusor.status().message(), HasSubstr("Invalid cast")); } XLA_TEST_F(ConstantsTest, ConstantR0WithTypeF32) { diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index fd6a02223ea..25179617548 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -33,8 +33,12 @@ template XlaOp EvaluatePolynomial(XlaOp x, absl::Span coefficients) { static_assert(std::is_floating_point::value, "Template-argument 'FP' must be a floating-point type"); - XlaOp poly = ScalarLike(x, 0.0); - for (FP c : coefficients) { + if (coefficients.empty()) { + return ScalarLike(x, FP(0.0)); + } + XlaOp poly = ScalarLike(x, coefficients[0]); + for (int i = 1; i < coefficients.size(); ++i) { + FP c = coefficients[i]; poly = poly * x + ScalarLike(x, c); } return poly; @@ -296,23 +300,27 @@ XlaOp Erfc(XlaOp x) { }); } -// Compute a polynomial approximation of the error function. -// This is the same approximation used by Eigen. +// Compute a rational approximation of the error function. 
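A plain restatement of the EvaluatePolynomial change above: the coefficients are ordered from highest to lowest degree and evaluated with Horner's rule, and seeding the accumulator with the leading coefficient rather than zero saves one multiply-add per call. Sketch in ordinary C++:

#include <cstddef>

// Horner's rule: p(x) = ((c0*x + c1)*x + c2)*x + ... + cn, with c0 the
// leading (highest-degree) coefficient. Starting from c0 avoids the extra
// "0*x + c0" step of a zero-seeded accumulator.
double EvaluatePolynomialHorner(double x, const double* coeffs, size_t n) {
  if (n == 0) return 0.0;
  double poly = coeffs[0];
  for (size_t i = 1; i < n; ++i) {
    poly = poly * x + coeffs[i];
  }
  return poly;
}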
static XlaOp ErfImpl32(XlaOp x) { - static const std::array kAlpha{ - -2.72614225801306e-10f, 2.77068142495902e-08f, -2.10102402082508e-06f, - -5.69250639462346e-05f, -7.34990630326855e-04f, -2.95459980854025e-03f, - -1.60960333262415e-02f, - }; + static const std::array kAlpha{ + 0.00022905065861350646f, 0.0034082910107109506f, 0.050955695062380861f, + 0.18520832239976145f, 1.128379143519084f}; - static const std::array kBeta{ - -1.45660718464996e-05f, -2.13374055278905e-04f, -1.68282697438203e-03f, - -7.37332916720468e-03f, -1.42647390514189e-02f, - }; + static const std::array kBeta{-1.1791602954361697e-7, + 0.000023547966471313185f, + 0.0010179625278914885f, + 0.014070470171167667f, + 0.11098505178285362f, + 0.49746925110067538f, + 1.0f}; - x = Clamp(ScalarLike(x, -4.f), x, ScalarLike(x, 4.f)); + // We clamp x to be within [-c;c] where c = erfinv(1-2^-23), outside of + // which x should be +/-1. + constexpr float kErfInvOneMinusHalfULP = 3.7439211627767994f; + x = Clamp(ScalarLike(x, -kErfInvOneMinusHalfULP), x, + ScalarLike(x, kErfInvOneMinusHalfULP)); auto x2 = x * x; - return x * EvaluatePolynomial(x2, kAlpha) / + return (x * EvaluatePolynomial(x2, kAlpha)) / EvaluatePolynomial(x2, kBeta); } @@ -330,10 +338,8 @@ XlaOp Erf(XlaOp x) { } // Erf(c)Impl don't have enough precision when run with bf16 intermediates // (not surprising!), so upcast to f32 in this case. - return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { - return Select(Lt(Abs(x), ScalarLike(x, 1)), ErfImpl32(x), - ScalarLike(x, 1) - ErfcImpl32(x)); - }); + return DoWithUpcastToF32(x, {BF16, F16}, + [](XlaOp x) { return ErfImpl32(x); }); }); } diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index 92a0eaf5b73..ccd4ee2b1cc 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/tsl/lib/core/status_test_util.h" namespace xla { namespace { @@ -143,7 +144,7 @@ class MathTypedTest : public MathTest { ComputeAndCompareR1(&b, expected, {}, error_spec_); } - void TestErfEdgeCases() { + void TestErfInvEdgeCases() { SetFastMathDisabled(true); XlaBuilder b(TestName()); @@ -155,6 +156,23 @@ class MathTypedTest : public MathTest { ComputeAndCompareR1(&b, expected, {}, error_spec_); } + + void TestErfEdgeCases() { + SetFastMathDisabled(true); + const T kErfInvOneMinusHalfULP = T(3.832506856900711); + const T inf(std::numeric_limits::infinity()); + + XlaBuilder b(TestName()); + auto x = AddParam(LiteralUtil::CreateR1({T{-inf}, T{inf}, T{-0}, T{0}, + T{-kErfInvOneMinusHalfULP}, + T{kErfInvOneMinusHalfULP}}), + &b); + Erf(x); + + std::vector expected = {T(-1), T(1), T(-0), T(0), T(-1), T(1)}; + + ComputeAndCompareR1(&b, expected, {}, error_spec_); + } }; // TODO(b/123355973): Add bfloat16 to TestTypes once it's working. @@ -178,7 +196,8 @@ XLA_TYPED_TEST(MathTypedTest, IsNegZero) { this->TestIsNegZero(); } XLA_TYPED_TEST(MathTypedTest, SqrtPowInequivalence) { this->TestSqrtPowInequivalence(); } -XLA_TYPED_TEST(MathTypedTest, ErfInvEdgeCases) { this->TestErfEdgeCases(); } +XLA_TYPED_TEST(MathTypedTest, ErfInvEdgeCases) { this->TestErfInvEdgeCases(); } +XLA_TYPED_TEST(MathTypedTest, ErfEdgeCases) { this->TestErfEdgeCases(); } // Check that certain ops only support real, floating-point inputs. 
// @@ -203,7 +222,7 @@ XLA_TEST_F(MathTest, RealFpOnlyOps) { } else { continue; } - if (ty == F8E5M2 || ty == F8E4M3FN) { + if (ty == F8E5M2 || ty == F8E4M3FN || ty == F8E4M3B11FNUZ) { // TODO(b/259609697): Add FP8 support to math ops continue; } @@ -226,7 +245,11 @@ XLA_TEST_F(MathTest, RealFpOnlyOps) { XlaOp p = Parameter(&b, 0, shape, "p0"); test.first(p); - EXPECT_EQ(b.first_error().ok(), primitive_util::IsFloatingPointType(ty)); + if (primitive_util::IsFloatingPointType(ty)) { + TF_EXPECT_OK(b.first_error()); + } else { + EXPECT_FALSE(b.first_error().ok()); + } } } } diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc index 8466b3d51a8..b0b66dd1b0a 100644 --- a/tensorflow/compiler/xla/client/lib/prng.cc +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -255,13 +256,15 @@ RngOutput ThreeFryRngBit32(XlaOp key, XlaOp initial_state, const Shape& shape) { // Generates random 16bits with the given shape using the Three Fry // implementation. Returns the random bits and the new state. -RngOutput ThreeFryRngBit16(XlaOp op_key, XlaOp initial_state, - const Shape& shape) { +RngOutput ThreeFryRngBitNarrow(XlaOp op_key, XlaOp initial_state, + const Shape& shape) { // TODO(b/256713018): Use a better approach to not waste the upper 16 bits. auto new_shape = shape; new_shape.set_element_type(U32); auto output = ThreeFryRngBit32(op_key, initial_state, new_shape); - output.value = ConvertElementType(output.value, U16); + output.value = ConvertElementType( + output.value, primitive_util::UnsignedIntegralTypeForBitWidth( + primitive_util::BitWidth(shape.element_type()))); return output; } @@ -446,15 +449,17 @@ RngOutput PhiloxRngBit32(XlaOp op_key, XlaOp initial_state, // Generates an array of primitive type U16 with the given shape containing // random bits generated by the Philox algorithm. Returns the array and the new // state of the random number generator. -RngOutput PhiloxRngBit16(XlaOp op_key, XlaOp initial_state, - const Shape& shape) { +RngOutput PhiloxRngBitNarrow(XlaOp op_key, XlaOp initial_state, + const Shape& shape) { // We use PhiloxRngBit32 and throw away the upper 16 bits here, to align with // the non-XLA kernels. // TODO(b/256713018): Use a better approach to not waste the upper 16 bits. 
auto new_shape = shape; new_shape.set_element_type(U32); auto output = PhiloxRngBit32(op_key, initial_state, new_shape); - output.value = ConvertElementType(output.value, U16); + output.value = ConvertElementType( + output.value, primitive_util::UnsignedIntegralTypeForBitWidth( + primitive_util::BitWidth(shape.element_type()))); return output; } @@ -593,10 +598,12 @@ RngOutput ThreeFryBitGenerator(XlaOp key, XlaOp initial_state, const Shape& shape) { PrimitiveType type = shape.element_type(); switch (type) { + case S8: + case U8: case F16: case U16: case S16: - return ThreeFryRngBit16(key, initial_state, shape); + return ThreeFryRngBitNarrow(key, initial_state, shape); case F32: case U32: case S32: @@ -619,10 +626,12 @@ RngOutput PhiloxBitGenerator(XlaOp key, XlaOp initial_state, const Shape& shape) { PrimitiveType type = shape.element_type(); switch (type) { + case S8: + case U8: case F16: case U16: case S16: - return PhiloxRngBit16(key, initial_state, shape); + return PhiloxRngBitNarrow(key, initial_state, shape); case F32: case U32: case S32: diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc index af4883b1f65..cdd1f4a542a 100644 --- a/tensorflow/compiler/xla/client/lib/sorting.cc +++ b/tensorflow/compiler/xla/client/lib/sorting.cc @@ -22,10 +22,11 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { -XlaOp TopK(XlaOp input, int64_t k) { +XlaOp TopK(XlaOp input, int64_t k, PrimitiveType index_type) { XlaBuilder* const builder = input.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); @@ -41,16 +42,17 @@ XlaOp TopK(XlaOp input, int64_t k) { int64_t num_partitions = CeilOfRatio(last_dim_size - k, kPerPartitionSize - k); if (num_partitions >= kMinNumPartitions) { - return TopKWithPartitions(input, k, num_partitions); + return TopKWithPartitions(input, k, num_partitions, index_type); } } - Shape iota_shape = ShapeUtil::MakeShape(S32, input_shape.dimensions()); - XlaOp iota_s32 = Iota(builder, iota_shape, last_dim); + Shape iota_shape = + ShapeUtil::MakeShape(index_type, input_shape.dimensions()); + XlaOp iota = Iota(builder, iota_shape, last_dim); for (int64_t i = 0; i < input_shape.rank(); ++i) { if (input_shape.is_dynamic_dimension(i)) { // Propagate dynamic dimension from inputs to iota. - iota_s32 = SetDimensionSize(iota_s32, GetDimensionSize(input, i), i); + iota = SetDimensionSize(iota, GetDimensionSize(input, i), i); } } auto input_dims = input_shape.dimensions(); @@ -101,13 +103,14 @@ XlaOp TopK(XlaOp input, int64_t k) { Or(sign_magnitude_to_from_ones_complement( BitcastConvertType(ConvertElementType(input, F32), S32)), ConstantR0(builder, kLow16BitsMask)); - XlaOp input_and_iota = Xor(input_f32_trimmed, iota_s32); + XlaOp input_and_iota = Xor(input_f32_trimmed, iota); // Sort in reverse order so the largest elements are at the beginning. // Breaking ties here is why the index bits need to be inverted. - XlaOp sort_result_raw = Sort( - {input_and_iota}, CreateScalarGtComputation({S32}, builder), last_dim, - /*is_stable=*/false); + XlaOp sort_result_raw = + Sort({input_and_iota}, + CreateScalarGtComputation({index_type}, builder), last_dim, + /*is_stable=*/false); // Slice off the first k values. 
sort_result_raw = @@ -132,9 +135,9 @@ XlaOp TopK(XlaOp input, int64_t k) { ConstantR0(builder, kLow16BitsMask)); } else { XlaOp sort_result = - Sort({input, iota_s32}, - CreateScalarGtComputation({input_shape.element_type(), S32}, - iota_s32.builder()), + Sort({input, iota}, + CreateScalarGtComputation( + {input_shape.element_type(), index_type}, iota.builder()), last_dim, /*is_stable=*/true); values = Slice(GetTupleElement(sort_result, 0), start_indices, limit_indices, strides); @@ -150,7 +153,8 @@ XlaOp TopK(XlaOp input, int64_t k) { }); } -XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions) { +XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions, + PrimitiveType index_type) { XlaBuilder* const builder = input.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); @@ -162,15 +166,16 @@ XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions) { CeilOfRatio(last_dim_size, num_partitions); // Do normal TopK when per partition size is smaller than or equal to k. if (k >= per_partition_size) { - return TopK(input, k); + return TopK(input, k, index_type); } - Shape iota_shape = ShapeUtil::MakeShape(S32, input_shape.dimensions()); - XlaOp iota_s32 = Iota(builder, iota_shape, last_dim); + Shape iota_shape = + ShapeUtil::MakeShape(index_type, input_shape.dimensions()); + XlaOp iota = Iota(builder, iota_shape, last_dim); for (int64_t i = 0; i < input_shape.rank(); ++i) { if (input_shape.is_dynamic_dimension(i)) { // Propagate dynamic dimension from inputs to iota. - iota_s32 = SetDimensionSize(iota_s32, GetDimensionSize(input, i), i); + iota = SetDimensionSize(iota, GetDimensionSize(input, i), i); } } @@ -180,25 +185,41 @@ XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions) { auto values = values_and_indices[0]; auto indices = values_and_indices[1]; auto input = values_and_indices[2]; - auto iota_s32 = values_and_indices[3]; + auto iota = values_and_indices[3]; // Slice value and indices for this partition. - XlaOp start = Mul(Add(partition, ConstantR0(builder, 1)), - ConstantR0(builder, per_partition_size)); + XlaOp start; + switch (index_type) { + case PrimitiveType::S16: + start = Mul(Add(partition, ConstantR0(builder, 1)), + ConstantR0(builder, per_partition_size)); + break; + case PrimitiveType::S32: + start = Mul(Add(partition, ConstantR0(builder, 1)), + ConstantR0(builder, per_partition_size)); + break; + case PrimitiveType::S64: + start = Mul(Add(partition, ConstantR0(builder, 1)), + ConstantR0(builder, per_partition_size)); + break; + default: + LOG(FATAL) << "Unsupported index type " + << PrimitiveType_Name(index_type); + } XlaOp sliced_input = DynamicSliceInMinorDims(input, {start}, {per_partition_size}); XlaOp sliced_indices = - DynamicSliceInMinorDims(iota_s32, {start}, {per_partition_size}); + DynamicSliceInMinorDims(iota, {start}, {per_partition_size}); // Concat with previous results. 
sliced_input = ConcatInDim(builder, {values, sliced_input}, last_dim); sliced_indices = ConcatInDim(builder, {indices, sliced_indices}, last_dim); // Sort this slice - XlaOp sort_result = - Sort({sliced_input, sliced_indices}, - CreateScalarGtComputation({input_shape.element_type(), S32}, - sliced_indices.builder()), - last_dim, true); + XlaOp sort_result = Sort( + {sliced_input, sliced_indices}, + CreateScalarGtComputation({input_shape.element_type(), index_type}, + sliced_indices.builder()), + last_dim, true); std::vector start_indices(input_shape.dimensions_size(), 0); std::vector limit_indices(input_dims.begin(), input_dims.end()); @@ -210,7 +231,7 @@ XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions) { limit_indices, strides); indices = Slice(GetTupleElement(sort_result, 1), start_indices, limit_indices, strides); - return std::vector{values, indices, input, iota_s32}; + return std::vector{values, indices, input, iota}; }; // Get the values and indices for the first topk so that they can @@ -222,12 +243,11 @@ XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions) { limit_indices[last_dim] = per_partition_size; // Slice value and indices for the first partition. XlaOp sliced_input = Slice(input, start_indices, limit_indices, strides); - XlaOp sliced_indices = - Slice(iota_s32, start_indices, limit_indices, strides); + XlaOp sliced_indices = Slice(iota, start_indices, limit_indices, strides); // Sort this slice XlaOp sort_result = Sort({sliced_input, sliced_indices}, - CreateScalarGtComputation({input_shape.element_type(), S32}, + CreateScalarGtComputation({input_shape.element_type(), index_type}, sliced_indices.builder()), last_dim, /*is_stable=*/true); @@ -241,10 +261,11 @@ XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions) { // Pass the result of the first TopK to the while loop and do // num_partition - 1 iterations. - TF_ASSIGN_OR_RETURN(auto values_and_indices, - ForEachIndex(num_partitions - 1, S32, topk_body_fn, - {values, indices, input, iota_s32}, - "topk_with_partition", builder)); + TF_ASSIGN_OR_RETURN( + auto values_and_indices, + ForEachIndex(num_partitions - 1, index_type, topk_body_fn, + {values, indices, input, iota}, "topk_with_partition", + builder)); return Tuple(builder, {values_and_indices[0], values_and_indices[1]}); }); } diff --git a/tensorflow/compiler/xla/client/lib/sorting.h b/tensorflow/compiler/xla/client/lib/sorting.h index 0f810ccb365..9fbdf1b9945 100644 --- a/tensorflow/compiler/xla/client/lib/sorting.h +++ b/tensorflow/compiler/xla/client/lib/sorting.h @@ -24,11 +24,14 @@ namespace xla { // Returns a tuple composed of the top `k` values and corresponding indices in // `input`. Output values are in descending order, from largest to smallest. -XlaOp TopK(XlaOp input, int64_t k); +XlaOp TopK(XlaOp input, int64_t k, + PrimitiveType index_type = PrimitiveType::S32); + // Split sort in TopK into smaller sorts. // Returns a tuple composed of the top `k` values and corresponding indices in // `input`. Output values are in descending order, from largest to smallest. 
-XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions = 1); +XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions = 1, + PrimitiveType index_type = PrimitiveType::S32); } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc index e820d5bfe6f..7d5de392067 100644 --- a/tensorflow/compiler/xla/client/lib/sorting_test.cc +++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc @@ -44,6 +44,14 @@ XLA_TEST_F(SortingTest, TopK3From8Indices) { ComputeAndCompareR1(&builder, {0, 1, 2}, {}); } +XLA_TEST_F(SortingTest, TopK3From8Int16Indices) { + XlaBuilder builder(TestName()); + auto x = + ConstantR1(&builder, {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}); + xla::GetTupleElement(xla::TopK(x, 3, PrimitiveType::S16), 1); + ComputeAndCompareR1(&builder, {7, 6, 5}, {}); +} + XLA_TEST_F(SortingTest, TopKFullSortMinInt) { XlaBuilder builder(TestName()); auto x_rev = ConstantR1(&builder, {std::numeric_limits::min(), @@ -140,6 +148,16 @@ XLA_TEST_F(SortingTest, TopK3From8Indices5Partitions) { ComputeAndCompareR1(&builder, {0, 1, 2}, {}); } +XLA_TEST_F(SortingTest, TopK3From8Int16Indices5Partitions) { + XlaBuilder builder(TestName()); + auto x_rev = + ConstantR1(&builder, {7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0}); + xla::GetTupleElement(xla::TopKWithPartitions(x_rev, 3, /*num_partitions=*/5, + PrimitiveType::S16), + 1); + ComputeAndCompareR1(&builder, {0, 1, 2}, {}); +} + XLA_TEST_F(SortingTest, TopKFullSortWithDuplicates2Partitions) { XlaBuilder builder(TestName()); XlaOp a; diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 2b3d972d5a5..339ce5b2ad8 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -1090,7 +1090,7 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, XlaOp lhs, XlaOp rhs, XlaOp ehs) { if (!status_or_shape.status().ok()) { return InvalidArgument( "%s Input scalar shapes may have been changed to non-scalar shapes.", - status_or_shape.status().error_message()); + status_or_shape.status().message()); } return AddOpWithShape(triop, status_or_shape.value(), diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 513670738f1..1b0eb3bc073 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -181,7 +181,7 @@ TEST_F(XlaBuilderTest, ShiftRightOperatorOnNonIntegerProducesError) { auto statusor = b.Build(); ASSERT_FALSE(statusor.ok()); EXPECT_THAT( - statusor.status().error_message(), + statusor.status().message(), HasSubstr("Argument to >> operator does not have an integral type")); } @@ -226,7 +226,7 @@ TEST_F(XlaBuilderTest, ShapeInferenceError) { Add(x, y); auto statusor = BuildHloModule(&b); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), + EXPECT_THAT(statusor.status().message(), HasSubstr("Shapes must be equal rank")); } @@ -250,7 +250,7 @@ TEST_F(XlaBuilderTest, ParameterAlreadyRegistered) { Add(x, y); auto statusor = BuildHloModule(&b); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), + EXPECT_THAT(statusor.status().message(), HasSubstr("parameter 0 already registered")); } @@ -345,7 +345,7 @@ TEST_F(XlaBuilderTest, BroadcastInDimWithNegativeSize) { /*broadcast_dimensions=*/{0, 1, 2}); auto statusor = BuildHloModule(&b); ASSERT_FALSE(statusor.ok()); - 
EXPECT_THAT(statusor.status().error_message(), HasSubstr("invalid shape")); + EXPECT_THAT(statusor.status().message(), HasSubstr("invalid shape")); } TEST_F(XlaBuilderTest, OperandFromWrongBuilder) { @@ -357,7 +357,7 @@ TEST_F(XlaBuilderTest, OperandFromWrongBuilder) { auto statusor = builder.Build(); ASSERT_FALSE(statusor.ok()); EXPECT_THAT( - statusor.status().error_message(), + statusor.status().message(), HasSubstr( "built by builder 'b1', but is trying to use it in builder 'main'")); } @@ -527,7 +527,7 @@ TEST_F(XlaBuilderTest, ReportError) { Add(b.ReportError(InvalidArgument("a test error")), x); auto statusor = b.Build(); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), HasSubstr("a test error")); + EXPECT_THAT(statusor.status().message(), HasSubstr("a test error")); } TEST_F(XlaBuilderTest, ReportErrorOrReturnHandlesNonErrors) { @@ -545,7 +545,7 @@ TEST_F(XlaBuilderTest, ReportErrorOrReturnHandlesErrors) { Add(b.ReportErrorOrReturn(op), ConstantR0(&b, 2.0)); auto statusor = b.Build(); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), HasSubstr("a test error")); + EXPECT_THAT(statusor.status().message(), HasSubstr("a test error")); } TEST_F(XlaBuilderTest, BuildWithSpecificRoot) { @@ -584,7 +584,7 @@ TEST_F(XlaBuilderTest, BuildWithSpecificRootWithWrongBuilder) { Status status = b.Build(other_param).status(); ASSERT_IS_NOT_OK(status); EXPECT_THAT( - status.error_message(), + status.message(), ::testing::HasSubstr("root operation is not in this computation")); } @@ -1238,7 +1238,7 @@ TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) { AfterAll(&b, {CreateToken(&b), ConstantR0(&b, 1.0)}); Status status = b.Build().status(); ASSERT_IS_NOT_OK(status); - EXPECT_THAT(status.error_message(), + EXPECT_THAT(status.message(), ::testing::HasSubstr("All operands to AfterAll must be tokens")); } @@ -1471,9 +1471,7 @@ TEST_F(XlaBuilderTest, OutfeedTokenSharding) { TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto it = std::find_if(module->entry_computation()->instructions().begin(), module->entry_computation()->instructions().end(), - [](const HloInstruction* i) { - return i->opcode() == HloOpcode::kOutfeed; - }); + HloPredicateIsOp); EXPECT_NE(it, module->entry_computation()->instructions().end()); auto* outfeed = *it; EXPECT_TRUE(outfeed->has_sharding()); @@ -1507,7 +1505,7 @@ TEST_F(XlaBuilderTest, InvalidSharding) { Parameter(&b, 0, shape2d, "p0"); auto statusor = b.Build(); EXPECT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), + EXPECT_THAT(statusor.status().message(), HasSubstr("Number of tile assignment dimensions (excluding " "subgroups) is different than the input rank")); } diff --git a/tensorflow/compiler/xla/comparison_util.cc b/tensorflow/compiler/xla/comparison_util.cc index e79d6b75606..69f4c0b2100 100644 --- a/tensorflow/compiler/xla/comparison_util.cc +++ b/tensorflow/compiler/xla/comparison_util.cc @@ -38,6 +38,7 @@ bool IsValidComparison(xla::PrimitiveType type, Comparison::Order order) { case F64: case F8E5M2: case F8E4M3FN: + case F8E4M3B11FNUZ: case C64: case C128: return true; @@ -105,6 +106,7 @@ Comparison::Order DefaultOrdering(PrimitiveType type) { return Comparison::Order::kTotal; case F8E5M2: case F8E4M3FN: + case F8E4M3B11FNUZ: case BF16: case F16: case F32: @@ -187,11 +189,11 @@ std::string ComparisonTypeToString(Comparison::Type type) { } } -std::string ComparisonPrimitiveTypeToString(PrimitiveType type) { +absl::string_view ComparisonPrimitiveTypeToString(PrimitiveType 
type) { return PrimitiveType_Name(type); } -std::string ComparisonOrderToString(Comparison::Order order) { +absl::string_view ComparisonOrderToString(Comparison::Order order) { switch (order) { case Comparison::Order::kPartial: return "PARTIALORDER"; @@ -262,6 +264,7 @@ Comparison::Type Comparison::DefaultComparisonType(PrimitiveType type) { return Type::kUnsigned; case F8E5M2: case F8E4M3FN: + case F8E4M3B11FNUZ: case F16: case F32: case BF16: @@ -316,6 +319,7 @@ std::optional Comparison::Inverse() const { case F64: case F8E5M2: case F8E4M3FN: + case F8E4M3B11FNUZ: case C64: case C128: case S4: diff --git a/tensorflow/compiler/xla/comparison_util.h b/tensorflow/compiler/xla/comparison_util.h index ec97bdc8c2c..1b6f349e8b9 100644 --- a/tensorflow/compiler/xla/comparison_util.h +++ b/tensorflow/compiler/xla/comparison_util.h @@ -234,8 +234,8 @@ inline std::ostream& operator<<(std::ostream& os, const Comparison& cmp) { std::string ComparisonDirectionToString(Comparison::Direction direction); std::string ComparisonTypeToString(Comparison::Type type); -std::string ComparisonPrimitiveTypeToString(PrimitiveType type); -std::string ComparisonOrderToString(Comparison::Order order); +absl::string_view ComparisonPrimitiveTypeToString(PrimitiveType type); +absl::string_view ComparisonOrderToString(Comparison::Order order); StatusOr StringToComparisonDirection( absl::string_view direction); diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 0c904033bdc..dabed025c92 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -58,6 +58,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_use_acl(true); #endif opts.set_xla_cpu_use_xla_runtime(false); + opts.set_xla_cpu_sparse_cuda_threads(0); opts.set_xla_cpu_enable_fast_math(false); // Disable forms of fast math that have caused users problems in the past. @@ -75,8 +76,11 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { // flag. opts.set_xla_gpu_enable_cublaslt(false); - // TODO(b/258036887): Remove this flag once CUDA Graphs are fully supported. - opts.set_xla_gpu_enable_cuda_graphs(false); + // TODO(b/258036887): Enable once CUDA Graphs are fully supported. + opts.set_xla_gpu_cuda_graph_level(0); + opts.set_xla_gpu_cuda_graph_instantiation_threshold(2); + opts.set_xla_gpu_enable_persistent_temp_buffers(false); + opts.set_xla_gpu_cuda_graph_capture_threshold(2); // Despite the name, fast min/max on GPUs does not seem to be any faster, and // adds very counter-intuitive "NaN-swallowing" behavior. 
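The defaults hunk above retires the single xla_gpu_enable_cuda_graphs switch in favor of a graph level plus two counters, all defaulted conservatively (level 0, instantiation and capture thresholds of 2). Below is a minimal sketch of how a client could override those defaults programmatically; it relies only on DefaultDebugOptionsIgnoringFlags() and the setters named in this hunk, while the header path and the wrapper function are illustrative assumptions, not part of the change.

// Sketch only: build a DebugOptions proto with CUDA-graph capture turned on.
// The include path is assumed from the .cc file modified above.
#include "tensorflow/compiler/xla/debug_options_flags.h"

namespace xla {

DebugOptions MakeCudaGraphDebugOptions() {
  DebugOptions opts = DefaultDebugOptionsIgnoringFlags();
  opts.set_xla_gpu_cuda_graph_level(1);                    // 0 = off; 1 captures fusions and memcpys.
  opts.set_xla_gpu_cuda_graph_instantiation_threshold(2);  // Instantiate after a capture has run this many times.
  opts.set_xla_gpu_cuda_graph_capture_threshold(2);        // Outline a region once this many instructions move into it.
  return opts;
}

}  // namespace xla

The same knobs are also registered as command-line flags later in this diff (xla_gpu_cuda_graph_level and friends), which is the usual way to flip them without recompiling.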
@@ -87,6 +91,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_force_host_platform_device_count(1); opts.set_xla_gpu_all_reduce_combine_threshold_bytes(30 * 1024 * 1024); opts.set_xla_gpu_enable_async_all_reduce(true); + opts.set_xla_gpu_enable_reassociation_for_converted_ar(true); + opts.set_xla_cpu_enable_xprof_traceme(false); opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false); opts.set_xla_multiheap_size_constraint_per_heap(-1); @@ -104,8 +110,15 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_simplify_all_fp_conversions(true); opts.set_xla_dump_latency_hiding_schedule(false); opts.set_xla_gpu_enable_latency_hiding_scheduler(false); + opts.set_xla_gpu_lhs_enable_gpu_async_tracker(false); + opts.set_xla_gpu_pgle_profile_directory(""); opts.set_xla_cpu_enable_mlir_tiling_and_fusion(true); + opts.set_xla_cpu_enable_custom_matmul_tiling(false); + opts.set_xla_cpu_matmul_tiling_m_dim(8); + opts.set_xla_cpu_matmul_tiling_n_dim(8); + opts.set_xla_cpu_matmul_tiling_k_dim(8); + opts.set_xla_cpu_enable_mlir_fusion_outlining(true); opts.set_xla_cpu_enable_experimental_deallocation(true); opts.set_xla_partitioning_algorithm( @@ -114,6 +127,12 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_enable_triton_gemm(true); opts.set_xla_gpu_enable_cudnn_int8x32_convolution_reordering(true); opts.set_xla_gpu_triton_gemm_any(false); + + // Moving reduce-scatter out of while loops can increase memory footprint, so + // turning it off by default. + opts.set_xla_gpu_enable_while_loop_reduce_scatter_code_motion(false); + + opts.set_xla_gpu_collective_inflation_factor(1); return opts; } @@ -326,14 +345,14 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_nans), debug_options->xla_cpu_fast_math_honor_nans(), "When xla_cpu_enable_fast_math is true then this controls whether we " - "allow operations to produce NaNs. Ignored when " + "allow operations to produce NaNs. Ignored when " "xla_cpu_enable_fast_math is false.")); flag_list->push_back(tsl::Flag( "xla_cpu_fast_math_honor_infs", bool_setter_for(&DebugOptions::set_xla_cpu_fast_math_honor_infs), debug_options->xla_cpu_fast_math_honor_infs(), "When xla_cpu_enable_fast_math is true then this controls whether we " - "allow operations to produce infinites. Ignored when " + "allow operations to produce infinites. Ignored when " "xla_cpu_enable_fast_math is false.")); flag_list->push_back(tsl::Flag( "xla_cpu_fast_math_honor_division", @@ -403,10 +422,10 @@ void MakeDebugOptionsFlags(std::vector* flag_list, flag_list->push_back(tsl::Flag( "xla_disable_all_hlo_passes", bool_setter_for(&DebugOptions::set_xla_disable_all_hlo_passes), false, - "Disables all HLO passes. Notes that some passes are necessary for " "correctness and the invariants that must be satisfied by 'fully " "optimized' HLO are different for different devices and may change " "over time. 
The only 'guarantee', such as it is, is that if you compile " "XLA and dump the optimized HLO for some graph, you should be able to " "run it again on the same device with the same build of XLA.")); flag_list->push_back( @@ -486,6 +505,12 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_cpu_use_xla_runtime), debug_options->xla_cpu_use_xla_runtime(), "Enable XLA Runtime in the CPU backend.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_sparse_cuda_threads", + int32_setter_for(&DebugOptions::set_xla_cpu_sparse_cuda_threads), + debug_options->xla_cpu_sparse_cuda_threads(), + "Sets number of CUDA threads for sparse GPU acceleration in the CPU " + "backend (0 = off).")); flag_list->push_back(tsl::Flag( "xla_gpu_crash_on_verification_failures", bool_setter_for( @@ -527,7 +552,7 @@ void MakeDebugOptionsFlags(std::vector* flag_list, "If multiple parameters, separate them by comma.")); flag_list->push_back(tsl::Flag( "xla_fuel", setter_for_xla_fuel, /*default_value_for_display=*/"", - "Sets compiler fuel, useful for bisecting bugs in passes. Format " + "Sets compiler fuel, useful for bisecting bugs in passes. Format " "--xla_fuel=PASS1=NUM1,PASS2=NUM2,...")); flag_list->push_back(tsl::Flag( "xla_dump_to", string_setter_for(&DebugOptions::set_xla_dump_to), @@ -727,6 +752,21 @@ void MakeDebugOptionsFlags(std::vector* flag_list, &DebugOptions::set_xla_gpu_enable_async_collective_permute), debug_options->xla_gpu_enable_async_collective_permute(), "Converts synchronous collective-permute ops into asynchronous.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_async_all_gather", + bool_setter_for(&DebugOptions::set_xla_gpu_enable_async_all_gather), + debug_options->xla_gpu_enable_async_all_gather(), + "Converts synchronous all-gather ops into asynchronous.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_async_reduce_scatter", + bool_setter_for(&DebugOptions::set_xla_gpu_enable_async_reduce_scatter), + debug_options->xla_gpu_enable_async_reduce_scatter(), + "Converts synchronous reduce-scatter ops into asynchronous.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_async_all_to_all", + bool_setter_for(&DebugOptions::set_xla_gpu_enable_async_all_to_all), + debug_options->xla_gpu_enable_async_all_to_all(), + "Converts synchronous all-to-all ops into asynchronous.")); flag_list->push_back(tsl::Flag( "xla_gpu_all_reduce_combine_threshold_bytes", int64_setter_for( @@ -749,6 +789,28 @@ void MakeDebugOptionsFlags(std::vector* flag_list, "ReduceScatter-AllReduce-AllGather sequence, with the initial " "ReduceScatter being performed over all of the devices in the same host. " "Set to < 1 to disable all-reduce decomposition.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_while_loop_reduce_scatter_code_motion", + bool_setter_for( + &DebugOptions:: + set_xla_gpu_enable_while_loop_reduce_scatter_code_motion), + debug_options->xla_gpu_enable_while_loop_reduce_scatter_code_motion(), + "Enable hoisting of reduce-scatter outside while loops.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_collective_inflation_factor", + int32_setter_for(&DebugOptions::set_xla_gpu_collective_inflation_factor), + debug_options->xla_gpu_collective_inflation_factor(), + "Inflation factor for collectives. 
If set to > 1, each XLA/GPU " "collective will execute multiple times (will yield incorrect results)")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_reassociation_for_converted_ar", + bool_setter_for( + &DebugOptions::set_xla_gpu_enable_reassociation_for_converted_ar), + debug_options->xla_gpu_enable_reassociation_for_converted_ar(), + "Enable allreduce reassociation on allreduces that are converted to a " + "wider type. " + "The reassociated allreduce will be promoted to a wider-typed " + "allreduce.")); flag_list->push_back( tsl::Flag("xla_gpu_dump_llvmir", bool_setter_for(&DebugOptions::set_xla_gpu_dump_llvmir), @@ -764,10 +826,33 @@ void MakeDebugOptionsFlags(std::vector* flag_list, debug_options->xla_gpu_enable_cublaslt(), "Use cuBLASLt for GEMMs when possible.")); flag_list->push_back(tsl::Flag( - "xla_gpu_enable_cuda_graphs", - bool_setter_for(&DebugOptions::set_xla_gpu_enable_cuda_graphs), - debug_options->xla_gpu_enable_cuda_graphs(), - "Use CUDA graphs to execute XLA GPU executables when possible.")); + "xla_gpu_cuda_graph_level", + int32_setter_for(&DebugOptions::set_xla_gpu_cuda_graph_level), + debug_options->xla_gpu_cuda_graph_level(), + "Set CUDA graph level. 0 = off; 1 = capture fusions and memcpys; 2 = " + "capture convolutions and gemms; 3 = capture collectives.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_cuda_graph_instantiation_threshold", + int32_setter_for( + &DebugOptions::set_xla_gpu_cuda_graph_instantiation_threshold), + debug_options->xla_gpu_cuda_graph_instantiation_threshold(), + "Instantiate a cuda graph after the number of times a captured function " + "is executed reaches the threshold.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_cuda_graph_capture_threshold", + int32_setter_for(&DebugOptions::set_xla_gpu_cuda_graph_capture_threshold), + debug_options->xla_gpu_cuda_graph_capture_threshold(), + "Capture a region as a function to be launched as a cuda graph if the " + "number of moved instructions reaches this threshold.")); + + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_persistent_temp_buffers", + bool_setter_for( + &DebugOptions::set_xla_gpu_enable_persistent_temp_buffers), + debug_options->xla_gpu_enable_persistent_temp_buffers(), + "Allocate temp buffers once during the first execution of an executable. " + "Reuse the allocated buffers in subsequent executions. Executables cannot" + " run concurrently if this is enabled.")); flag_list->push_back( tsl::Flag("xla_dump_disable_metadata", bool_setter_for(&DebugOptions::set_xla_dump_disable_metadata), @@ -834,7 +919,7 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_cpu_strict_dot_conv_math), debug_options->xla_cpu_strict_dot_conv_math(), "By default, XLA:CPU will run fp16 dot/conv as fp32, as this is " - "generally (much) faster on our hardware. 
Set this flag to true to " "disable this behavior.")); flag_list->push_back(tsl::Flag( "xla_dump_latency_hiding_schedule", @@ -846,6 +931,31 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_cpu_enable_mlir_tiling_and_fusion), debug_options->xla_cpu_enable_mlir_tiling_and_fusion(), "Enable MLIR tiling and fusion.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_enable_mlir_fusion_outlining", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_mlir_fusion_outlining), + debug_options->xla_cpu_enable_mlir_fusion_outlining(), + "Enable MLIR fusion outlining (to improve compile time).")); + flag_list->push_back(tsl::Flag( + "xla_cpu_enable_custom_matmul_tiling", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_custom_matmul_tiling), + debug_options->xla_cpu_enable_custom_matmul_tiling(), + "Enable custom tiling given by M, K, N parameters.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_matmul_tiling_m_dim", + int64_setter_for(&DebugOptions::set_xla_cpu_matmul_tiling_m_dim), + debug_options->xla_cpu_matmul_tiling_m_dim(), + "Custom tile size for matmul's M dimension.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_matmul_tiling_n_dim", + int64_setter_for(&DebugOptions::set_xla_cpu_matmul_tiling_n_dim), + debug_options->xla_cpu_matmul_tiling_n_dim(), + "Custom tile size for matmul's N dimension.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_matmul_tiling_k_dim", + int64_setter_for(&DebugOptions::set_xla_cpu_matmul_tiling_k_dim), + debug_options->xla_cpu_matmul_tiling_k_dim(), + "Custom tile size for matmul's K dimension.")); flag_list->push_back(tsl::Flag( "xla_cpu_enable_experimental_deallocation", bool_setter_for( @@ -858,6 +968,16 @@ void MakeDebugOptionsFlags(std::vector* flag_list, &DebugOptions::set_xla_gpu_enable_latency_hiding_scheduler), debug_options->xla_gpu_enable_latency_hiding_scheduler(), "Enable latency-hiding scheduler for XLA:GPU")); + flag_list->push_back(tsl::Flag( + "xla_gpu_pgle_profile_directory", + string_setter_for(&DebugOptions::set_xla_gpu_pgle_profile_directory), + debug_options->xla_gpu_pgle_profile_directory(), + "Directory for PGLE profiles in XLA:GPU")); + flag_list->push_back(tsl::Flag( + "xla_gpu_lhs_enable_gpu_async_tracker", + bool_setter_for(&DebugOptions::set_xla_gpu_lhs_enable_gpu_async_tracker), + debug_options->xla_gpu_lhs_enable_gpu_async_tracker(), + "Enable GPU async tracker for latency-hiding scheduler in XLA:GPU")); flag_list->push_back(tsl::Flag( "xla_partitioning_algorithm", setter_for_xla_partitioning_algorithm, DebugOptions::PartitioningAlgorithm_Name( diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index 8a0aa19dc06..58be6edef78 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -21,6 +21,9 @@ limitations under the License. 
#include #include +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" + // These classes are forward declared so that ExecutableRunOptions can be linked // into an XLA-compiled binary without having to link all of the pointed-to // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't @@ -38,18 +41,12 @@ struct ThreadPoolDevice; } // namespace Eigen namespace tsl { -class Status; -template -class StatusOr; template class AsyncValueRef; } // namespace tsl namespace xla { -using ::tsl::Status; // TENSORFLOW_STATUS_OK -using ::tsl::StatusOr; // TENSORFLOW_STATUS_OK - class DeviceAssignment; class ExecutionProfile; class Shape; @@ -96,19 +93,23 @@ using ThenExecuteFunction = // Callback for sending device buffer to a channel. Returned event will be // recorded on a `stream` once the send operation is completed and data was -// copied from the `src` memory. +// copied from the `src` memory. `frontend_attrs` contains frontend specific +// attributes for the send. using SendDeviceMemoryFunction = - std::function>( + std::function>( int64_t channel_id, stream_executor::Stream* stream, const Shape& shape, - const stream_executor::DeviceMemoryBase& src)>; + const stream_executor::DeviceMemoryBase& src, + const absl::flat_hash_map& frontend_attrs)>; // Callback for receiving device buffer from a channel. Returned event will be // recorded on a `stream` once the recv operation is completed and data was -// copied into the `dst` memory. +// copied into the `dst` memory. `frontend_attrs` contains frontend specific +// attributes for the receive. using RecvDeviceMemoryFunction = - std::function>( + std::function>( int64_t channel_id, stream_executor::Stream* stream, const Shape& shape, - stream_executor::DeviceMemoryBase* dst)>; + stream_executor::DeviceMemoryBase* dst, + const absl::flat_hash_map& frontend_attrs)>; // Class containing options for running a LocalExecutable. class ExecutableRunOptions { diff --git a/tensorflow/compiler/xla/experimental/conv_emitter/BUILD b/tensorflow/compiler/xla/experimental/conv_emitter/BUILD deleted file mode 100644 index 35b2e106800..00000000000 --- a/tensorflow/compiler/xla/experimental/conv_emitter/BUILD +++ /dev/null @@ -1,92 +0,0 @@ -# Description: -# MLIR-GPU-specific convolution in XLA service implementation. - -load("//tensorflow/tsl/platform:rules_cc.bzl", "cc_library") -load("//tensorflow/compiler/xla:xla.bzl", "xla_cc_test") - -package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - default_visibility = [":friends"], - licenses = ["notice"], -) - -package_group( - name = "friends", - includes = ["//tensorflow/compiler/xla:friends"], -) - -# Filegroup used to collect source files for dependency checking. 
-filegroup( - name = "c_srcs", - data = glob([ - "**/*.cc", - "**/*.h", - ]), -) - -cc_library( - name = "conv_emitter", - srcs = ["conv_emitter.cc"], - hdrs = ["conv_emitter.h"], - deps = [ - ":conv_emitter_transforms", - "//tensorflow/compiler/xla:permutation_util", - "//tensorflow/compiler/xla:window_util", - "//tensorflow/compiler/xla/hlo/ir:hlo", - "//tensorflow/compiler/xla/service/llvm_ir:llvm_type_conversion_util", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineUtils", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:TransformUtils", - ], -) - -cc_library( - name = "conv_emitter_transforms", - srcs = ["conv_emitter_transforms.cc"], - hdrs = ["conv_emitter_transforms.h"], - deps = [ - "//tensorflow/tsl/platform:logging", - "//tensorflow/tsl/platform:types", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineUtils", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:TransformUtils", - ], -) - -xla_cc_test( - name = "conv_emitter_test", - srcs = ["conv_emitter_test.cc"], - deps = [ - ":conv_emitter", - "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", - "//tensorflow/compiler/xla/tests:filecheck", - "//tensorflow/compiler/xla/tests:verified_hlo_module", - "//tensorflow/tsl/platform:test", - "//tensorflow/tsl/platform:test_main", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineToStandard", - "@llvm-project//mlir:AllPassesAndDialects", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:FuncToLLVM", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:MemRefToLLVM", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:SCFToControlFlow", - "@llvm-project//mlir:Transforms", - ], -) diff --git a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter.cc b/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter.cc deleted file mode 100644 index c5af2884e0a..00000000000 --- a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter.cc +++ /dev/null @@ -1,608 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This is an explorative prototype emitter for convolution using MLIR. -// This prototype is still under construction. -// TODO(timshen): Fix the documentation once it's implemented. -// -// Goals: -// * Autotune-able tiling. -// * Autotune-able memory accesses. -// * Autotune-able lowering logic (from a portable program to thread-oriented -// CUDA program). 
-// * Use milr::AffineExpr to analyze all accesses. It aims to algorithmically -// find memory access strategies for given input layouts and tiling configs. - -#include "tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter.h" - -#include "absl/types/span.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project -#include "mlir/Dialect/Affine/LoopUtils.h" // from @llvm-project -#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/Dialect/MemRef/IR/MemRef.h" // from @llvm-project -#include "mlir/IR/AffineExpr.h" // from @llvm-project -#include "mlir/IR/AffineMap.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project -#include "mlir/Transforms/RegionUtils.h" // from @llvm-project -#include "tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_transforms.h" -#include "tensorflow/compiler/xla/permutation_util.h" -#include "tensorflow/compiler/xla/service/llvm_ir/llvm_type_conversion_util.h" -#include "tensorflow/compiler/xla/window_util.h" - -namespace xla { -namespace experimental { -namespace { - -using mlir::OpBuilder; - -// Various extracted information for input shapes. -struct ShapeInfo { - // Buffer dimensions in the order of NCHW. - std::vector nchw_dimensions; - - // Buffer dimensions in the order of major to minor; - std::vector physical_dimensions; - - // The affine map that takes NCHW indices, and maps to the physical order. - mlir::AffineMap affine_map; - - mlir::Type element_type; -}; - -ShapeInfo GetShapeInfo(const Shape& shape, int64_t n_dim, int64_t c_dim, - absl::Span spatial_dims, - mlir::Builder builder) { - ShapeInfo shape_info; - - std::vector physical_to_logical( - shape.layout().minor_to_major().rbegin(), - shape.layout().minor_to_major().rend()); - - std::vector nchw_to_logical; - - nchw_to_logical.push_back(n_dim); - nchw_to_logical.push_back(c_dim); - for (int64_t dim : spatial_dims) { - nchw_to_logical.push_back(dim); - } - - for (int64_t dim : nchw_to_logical) { - shape_info.nchw_dimensions.push_back(shape.dimensions(dim)); - } - - for (int64_t dim : physical_to_logical) { - shape_info.physical_dimensions.push_back(shape.dimensions(dim)); - } - - std::vector affine_exprs; - // We want physical to nchw order. - for (int64_t dim : ComposePermutations(InversePermutation(nchw_to_logical), - physical_to_logical)) { - affine_exprs.push_back(builder.getAffineDimExpr(dim)); - } - - shape_info.affine_map = mlir::AffineMap::get( - /*dimCount=*/2 + spatial_dims.size(), /*symbolCount=*/0, affine_exprs, - builder.getContext()); - - shape_info.element_type = [&] { - switch (shape.element_type()) { - case xla::F16: - return builder.getF16Type(); - case xla::F32: - return builder.getF32Type(); - default: - break; - } - CHECK(false); - }(); - - return shape_info; -} - -void SetMemRef(mlir::Operation* op, mlir::Value memref) { - if (auto load = mlir::dyn_cast(op)) { - load.setMemRef(memref); - } else if (auto store = mlir::dyn_cast(op)) { - store.setMemRef(memref); - } else { - CHECK(false); - } -} - -// Hoist operations out of `where`. [begin_op, end_op) must be the first -// operations of their parent loop, and `where` must be an ancestor of that -// parent loop. -// -// It always preserves the semantics of the program, therefore it may modify the -// hoisted operations or add extra loops at the hoisted place. 
-mlir::Operation* HoistAndFix(llvm::iplist::iterator begin_op, - llvm::iplist::iterator end_op, - mlir::AffineForOp where) { - // All loops to hoist through. - llvm::SmallVector ancestors; - getPerfectlyNestedLoops(ancestors, where); - { - int i; - for (i = 0; i < ancestors.size(); i++) { - if (&ancestors[i].getBody()->front() == &*begin_op) { - break; - } - } - CHECK(i < ancestors.size()); - ancestors.resize(i + 1); - } - - std::vector ancestor_dimensions; - for (auto ancestor : ancestors) { - CHECK(IsSimpleLoop(ancestor)); - ancestor_dimensions.push_back( - ancestor.getUpperBoundMap().getSingleConstantResult()); - } - - if (auto alloc = mlir::dyn_cast(begin_op)) { - CHECK(std::next(begin_op) == end_op) - << "alloc() needs to be hoisted by its own"; - - OpBuilder builder(where); - mlir::MemRefType type = alloc.getType(); - CHECK(type.getLayout().isIdentity()); - ancestor_dimensions.insert(ancestor_dimensions.end(), - type.getShape().begin(), type.getShape().end()); - mlir::MemRefType new_type = - mlir::MemRefType::get(ancestor_dimensions, type.getElementType()); - auto new_alloc = builder.create( - builder.getUnknownLoc(), new_type); - - std::vector indvars; - for (auto ancestor : ancestors) { - indvars.push_back(ancestor.getInductionVar()); - } - for (auto& use : llvm::make_early_inc_range(alloc.getResult().getUses())) { - mlir::Operation* owner = use.getOwner(); - BoundAffineMap affine_map = GetBoundAffineMapFrom(owner); - affine_map.operands.insert(affine_map.operands.begin(), indvars.begin(), - indvars.end()); - CHECK(affine_map.affine_map.isIdentity()); - affine_map.affine_map = mlir::AffineMap::getMultiDimIdentityMap( - affine_map.operands.size(), builder.getContext()); - - mlir::Operation* new_op = - CloneWithNewAffineMap(owner, affine_map, OpBuilder(owner)); - SetMemRef(new_op, new_alloc); - owner->replaceAllUsesWith(new_op); - owner->erase(); - } - alloc.erase(); - return new_alloc; - } - - const bool any_op_is_loop_variant = [&] { - for (mlir::Operation& op : llvm::make_range(begin_op, end_op)) { - if (mlir::isa(op)) { - return true; - } - } - return false; - }(); - - if (any_op_is_loop_variant) { - auto builder = OpBuilder(where); - std::vector new_loops; - for (auto dim : ancestor_dimensions) { - auto where = - builder.create(builder.getUnknownLoc(), 0, dim); - new_loops.push_back(where); - builder = OpBuilder::atBlockTerminator(where.getBody()); - } - for (mlir::Operation& op : - llvm::make_early_inc_range(llvm::make_range(begin_op, end_op))) { - op.moveBefore(&new_loops.back().getBody()->back()); - } - CHECK_EQ(ancestors.size(), new_loops.size()); - for (int i = 0; i < ancestors.size(); i++) { - replaceAllUsesInRegionWith(ancestors[i].getInductionVar(), - new_loops[i].getInductionVar(), - new_loops.back().getRegion()); - } - return new_loops.front(); - } - CHECK(false); -} - -mlir::Operation* HoistAndFix(mlir::Operation* op, mlir::AffineForOp where) { - return HoistAndFix(op->getIterator(), std::next(op->getIterator()), where); -} - -struct InitialMlirConvAnchors { - std::vector cartesian_product_loops; - std::vector reduction_loops; - mlir::memref::AllocOp output_acc; -}; - -// Return the following IR with the anchors set to corresponding operations. -// for (cartesian loops...) { -// %output_acc = alloc() : memref(f32) -// output_acc[] = 0 -// for (reduction loops...) { -// output_acc[] += input[...] * filter[...] -// } -// output[...] 
= output_acc[] -// } -StatusOr CreateNaiveMlirConv( - mlir::Value input, mlir::Value filter, mlir::Value output, - const ShapeInfo& input_shape_info, const ShapeInfo& filter_shape_info, - const ShapeInfo& output_shape_info, const Window& window, - OpBuilder builder) { - CHECK(input_shape_info.element_type == builder.getF16Type()); - CHECK(filter_shape_info.element_type == builder.getF16Type()); - CHECK(output_shape_info.element_type == builder.getF16Type()); - - auto location = mlir::UnknownLoc::get(builder.getContext()); - - std::vector cartesian_product_loops = - CreateNestedSimpleLoops(output_shape_info.nchw_dimensions, builder); - - builder = - OpBuilder::atBlockTerminator(cartesian_product_loops.back().getBody()); - - auto output_acc = builder.create( - location, mlir::MemRefType::get({}, builder.getF32Type())); - - builder.create( - location, - builder.create( - location, mlir::FloatAttr::get(builder.getF32Type(), 0)), - output_acc, llvm::ArrayRef()); - - std::vector reduction_loops; - reduction_loops = CreateNestedSimpleLoops( - absl::MakeSpan(filter_shape_info.nchw_dimensions).subspan(1), builder); - - mlir::AffineForOp loop_n = cartesian_product_loops[0]; - mlir::AffineForOp loop_o = cartesian_product_loops[1]; - mlir::AffineForOp loop_c = reduction_loops[0]; - - std::vector output_spatial_indvars; - for (auto loop : absl::MakeSpan(cartesian_product_loops).subspan(2)) { - output_spatial_indvars.push_back(loop.getInductionVar()); - } - std::vector filter_spatial_indvars; - for (auto loop : absl::MakeSpan(reduction_loops).subspan(1)) { - filter_spatial_indvars.push_back(loop.getInductionVar()); - } - int num_spatial_dims = output_spatial_indvars.size(); - CHECK_EQ(num_spatial_dims, filter_spatial_indvars.size()); - - builder = OpBuilder::atBlockTerminator(reduction_loops.back().getBody()); - - mlir::Value loaded_input = [&] { - std::vector input_indices; - input_indices.push_back(builder.getAffineDimExpr(0)); - input_indices.push_back(builder.getAffineDimExpr(1)); - - // For spatial dimensions, generate input_index * stride + filter_index - - // left_pad - // - // TODO(timshen): guard out-of-bound loads and stores brought by padding. 
- for (int i = 0; i < num_spatial_dims; i++) { - const WindowDimension& window_dim = window.dimensions(i); - input_indices.push_back( - builder.getAffineDimExpr(i + 2) * window_dim.stride() + - builder.getAffineDimExpr(2 + num_spatial_dims + i) - - window_dim.padding_low()); - } - std::vector input_vars; - input_vars.push_back(loop_n.getInductionVar()); - input_vars.push_back(loop_c.getInductionVar()); - input_vars.insert(input_vars.end(), output_spatial_indvars.begin(), - output_spatial_indvars.end()); - input_vars.insert(input_vars.end(), filter_spatial_indvars.begin(), - filter_spatial_indvars.end()); - - return builder.create( - location, builder.getF32Type(), - builder.createOrFold( - location, input, - mlir::AffineMap(input_shape_info.affine_map) - .compose(mlir::AffineMap::get( - /*dimCount=*/2 + num_spatial_dims * 2, - /*symbolCount=*/0, input_indices, builder.getContext())), - input_vars)); - }(); - - mlir::Value loaded_filter = [&] { - std::vector filter_vars; - filter_vars.push_back(loop_o.getInductionVar()); - filter_vars.push_back(loop_c.getInductionVar()); - filter_vars.insert(filter_vars.end(), filter_spatial_indvars.begin(), - filter_spatial_indvars.end()); - - return builder.create( - location, builder.getF32Type(), - builder.createOrFold( - location, filter, filter_shape_info.affine_map, filter_vars)); - }(); - - auto accum_load_op = - builder.createOrFold(location, output_acc); - builder.createOrFold( - location, - builder.create( - location, accum_load_op, - builder.create(location, loaded_input, - loaded_filter)), - output_acc, llvm::ArrayRef()); - - builder.setInsertionPointAfter(reduction_loops[0]); - { - std::vector output_vars; - output_vars.push_back(loop_n.getInductionVar()); - output_vars.push_back(loop_o.getInductionVar()); - output_vars.insert(output_vars.end(), output_spatial_indvars.begin(), - output_spatial_indvars.end()); - builder.createOrFold( - location, - builder.create( - location, builder.getF16Type(), - builder.createOrFold(location, output_acc)), - output, output_shape_info.affine_map, output_vars); - } - - return InitialMlirConvAnchors{cartesian_product_loops, reduction_loops, - output_acc}; -} - -// Contains the following pattern with anchors: -// for (cartesian loops...) { -// %output_acc = alloc() : memref(..., f32) -// for (reduction loops...) { -// for (tiled cartesian loops...) { -// output_acc[...] = 0 -// } -// for (tiled cartesian loops...) { -// for (reduction loops...) { -// output_acc[] += input[...] * filter[...] -// } -// } -// for (tiled cartesian loops...) { -// output[...] = output_acc[...] -// } -// } -// } -struct TransformedMlirConvAnchors { - std::vector cartesian_product_loops; - std::vector reduction_loops; -}; - -StatusOr TransformMlirConv( - InitialMlirConvAnchors anchors) { - std::vector cartesian_product_loops = - anchors.cartesian_product_loops; - std::vector reduction_loops = anchors.reduction_loops; - mlir::memref::AllocOp output_acc = anchors.output_acc; - - // TODO(timshen): consider using pattern matchers for transformations - // - // Initial form: - // for (cartesian loops...) { - // %output_acc = alloc() : memref(f32) - // output_acc[] = 0 - // for (reduction loops...) { - // output_acc[] += input[...] * filter[...] - // } - // output[...] = output_acc[] - // } - - // Tile cartesian loops to: - // for (cartesian loops...) { - // for (tiled cartesian loops...) { - // %output_acc = alloc() : memref(f32) - // output_acc[] = 0 - // for (reduction loops...) { - // output_acc[] += input[...] * filter[...] 
- // } - // output[...] = output_acc[] - // } - // } - TileLoop(reduction_loops[0], 4, reduction_loops.back()); - - std::vector tiled_cartesian_loops; - tiled_cartesian_loops.push_back( - TileLoop(cartesian_product_loops[1], 32, cartesian_product_loops.back())); - - tiled_cartesian_loops.push_back(TileLoop(cartesian_product_loops.back(), 16, - tiled_cartesian_loops.back())); - - // Two hoist operations to interleave the allocation, computation, and - // writebacks to output_acc: - // After first hoist: - // for (cartesian loops...) { - // %output_acc = alloc() : memref(..., f32) - // for (tiled cartesian loops...) { - // output_acc[...] = 0 - // for (reduction loops...) { - // output_acc[...] += input[...] * filter[...] - // } - // output[...] = output_acc[...] - // } - // } - output_acc = llvm::cast( - HoistAndFix(output_acc, tiled_cartesian_loops.front())); - - // Hoist everything before reduction loops (aka zero initializations of - // output_acc): - // for (cartesian loops...) { - // %output_acc = alloc() : memref(..., f32) - // for (tiled cartesian loops...) { - // output_acc[...] = 0 - // } - // for (tiled cartesian loops...) { - // for (reduction loops...) { - // output_acc[...] += input[...] * filter[...] - // } - // output[...] = output_acc[...] - // } - // } - HoistAndFix(tiled_cartesian_loops.back().getBody()->begin(), - reduction_loops.front().getOperation()->getIterator(), - tiled_cartesian_loops.front()); - - // Now hoist all reduction loops outside of tiled cartesian loops. - // Notice that HoistAndFix automatically add a new set of tiled cartesian - // loops for hoisted reduction loops to keep the semantics correct. - // - // After second hoist: - // for (cartesian loops...) { - // %output_acc = alloc() : memref(..., f32) - // for (tiled cartesian loops...) { - // output_acc[...] = 0 - // } - // for (tiled cartesian loops...) { - // for (reduction loops...) { - // output_acc[] += input[...] * filter[...] - // } - // } // compute loop - // for (tiled cartesian loops...) { - // output[...] = output_acc[...] - // } - // } - { - auto compute_loop = llvm::cast( - HoistAndFix(reduction_loops.front(), tiled_cartesian_loops[0])); - - // Fix tiled_cartesian_loops to make them point to the tiled compute loops, - // not the writeback loops to output buffer. - llvm::SmallVector all_loops; - getPerfectlyNestedLoops(all_loops, compute_loop); - absl::c_copy_n(all_loops, tiled_cartesian_loops.size(), - tiled_cartesian_loops.data()); - } - - // After exchanging tiled cartesian compute loops with reduction loops: - // for (cartesian loops...) { - // %output_acc = alloc() : memref(..., f32) - // for (tiled cartesian loops...) { - // output_acc[...] = 0 - // } - // for (reduction loops...) { - // for (tiled cartesian loops...) { - // output_acc[] += input[...] * filter[...] - // } - // } - // for (tiled cartesian loops...) { - // output[...] = output_acc[...] - // } - // } - // - // ...so that later tiled cartesian loops (with computations in it) can be - // replaced by CUDA MMA instructions. 
- { - std::vector loops; - loops.insert(loops.end(), tiled_cartesian_loops.begin(), - tiled_cartesian_loops.end()); - loops.insert(loops.end(), reduction_loops.begin(), reduction_loops.end()); - SinkPerfectlyNestedLoops(loops, tiled_cartesian_loops.size()); - } - return TransformedMlirConvAnchors{cartesian_product_loops, reduction_loops}; -} - -} // namespace - -StatusOr EmitConvolutionForwardAsMlir( - HloInstruction* conv, absl::string_view function_name, - mlir::MLIRContext* context) { - OpBuilder builder(context); - - const auto& dim_nums = conv->convolution_dimension_numbers(); - ShapeInfo input_shape_info = - GetShapeInfo(conv->operand(0)->shape(), dim_nums.input_batch_dimension(), - dim_nums.input_feature_dimension(), - dim_nums.input_spatial_dimensions(), builder); - - ShapeInfo filter_shape_info = GetShapeInfo( - conv->operand(1)->shape(), dim_nums.kernel_output_feature_dimension(), - dim_nums.kernel_input_feature_dimension(), - dim_nums.kernel_spatial_dimensions(), builder); - - ShapeInfo output_shape_info = GetShapeInfo( - conv->shape().tuple_shapes(0), dim_nums.output_batch_dimension(), - dim_nums.output_feature_dimension(), dim_nums.output_spatial_dimensions(), - builder); - - auto function = mlir::func::FuncOp::create( - mlir::UnknownLoc::get(builder.getContext()), - llvm_ir::AsStringRef(function_name), - builder.getFunctionType( - {mlir::MemRefType::get(output_shape_info.physical_dimensions, - output_shape_info.element_type, - mlir::AffineMap()), - mlir::MemRefType::get(input_shape_info.physical_dimensions, - input_shape_info.element_type, - mlir::AffineMap()), - mlir::MemRefType::get(filter_shape_info.physical_dimensions, - filter_shape_info.element_type, - mlir::AffineMap())}, - {})); - - auto* entry_block = function.addEntryBlock(); - builder.setInsertionPointToStart(entry_block); - builder.create(builder.getUnknownLoc()); - builder.setInsertionPointToStart(entry_block); - - mlir::Value input = entry_block->getArgument(1); - mlir::Value filter = entry_block->getArgument(2); - mlir::Value output = entry_block->getArgument(0); - - TF_RETURN_IF_ERROR(ConvIsImplemented(conv)); - - TF_ASSIGN_OR_RETURN( - InitialMlirConvAnchors initial_anchors, - CreateNaiveMlirConv(input, filter, output, input_shape_info, - filter_shape_info, output_shape_info, conv->window(), - builder)); - - TF_ASSIGN_OR_RETURN(TransformedMlirConvAnchors transformed_anchors, - TransformMlirConv(initial_anchors)); - - // TODO(timshen): Implement a transformation that collects loads to a given - // buffer, create a local alloc() for the accessed part, redirects all loads - // and stores to that local alloc(), and create code to initialize / - // writeback the local alloc() if needed. - - // TODO(timshen): Implement CUDA-specific lowering. 
- - return function; -} - -Status ConvIsImplemented(const HloInstruction* conv) { - if (conv->feature_group_count() != 1 || conv->batch_group_count() != 1) { - return Unimplemented("group count is not implemented."); - } - if (window_util::HasWindowReversal(conv->window())) { - return Unimplemented("Window reversal is not implemented."); - } - if (window_util::HasDilation(conv->window())) { - return Unimplemented("Dilation is not implemented."); - } - return ::tsl::OkStatus(); -} - -} // namespace experimental -} // namespace xla diff --git a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter.h b/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter.h deleted file mode 100644 index a380800b2f7..00000000000 --- a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_EXPERIMENTAL_CONV_EMITTER_CONV_EMITTER_H_ -#define TENSORFLOW_COMPILER_XLA_EXPERIMENTAL_CONV_EMITTER_CONV_EMITTER_H_ - -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "tensorflow/compiler/xla/hlo/ir/hlo_instruction.h" - -namespace xla { -namespace experimental { - -// Builds MLIR using custom_call that represents a foward convolution. -// -// The generated function has the following signature: -// func @(%output: memref, -// %input: memref, -// %filter: memref) { ... } -// -// Note that the custom_call is XLA/GPU-specific, as it calls into cuDNN's -// forward convolution. However, here we are building a MLIR custom emitter, and -// we are not calling into cuDNN. We just want to borrow the HLO representation -// that already exists in XLA/GPU backend. -// -// `input`, `filter`, `output` are convolution inputs. -StatusOr EmitConvolutionForwardAsMlir( - HloInstruction* conv, absl::string_view function_name, - mlir::MLIRContext* context); - -// Returns OkStatus() if convolution can be implemented by this emitter. -Status ConvIsImplemented(const HloInstruction* conv); - -} // namespace experimental -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_EXPERIMENTAL_CONV_EMITTER_CONV_EMITTER_H_ diff --git a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_test.cc b/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_test.cc deleted file mode 100644 index c0ab3e283a1..00000000000 --- a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_test.cc +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter.h" - -#include - -#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" // from @llvm-project -#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" // from @llvm-project -#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" // from @llvm-project -#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project -#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/Dialect/MemRef/IR/MemRef.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Transforms/Passes.h" // from @llvm-project -#include "tensorflow/compiler/xla/service/hlo_parser.h" -#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" -#include "tensorflow/compiler/xla/tests/filecheck.h" -#include "tensorflow/compiler/xla/tests/verified_hlo_module.h" -#include "tensorflow/tsl/platform/test.h" - -namespace xla { -namespace experimental { -namespace { - -std::string CompileHloConvAndGetMlir(absl::string_view hlo_text) { - xla::HloModuleConfig hlo_config; - VerifiedHloModule hlo_module( - "Conv", hlo_config, /*verifier_layout_sensitive=*/false, - /*allow_mixed_precision_in_hlo_verifier=*/true, - /*shape_size_function=*/ShapeUtil::ByteSizeOfElements); - TF_CHECK_OK(hlo_module.ParseHloStringAndVerifyModule(hlo_text)); - xla::HloInstruction* conv = - hlo_module.entry_computation()->root_instruction(); - - mlir::MLIRContext context; - context.loadDialect(); - mlir::OwningOpRef mlir_module( - mlir::ModuleOp::create(mlir::UnknownLoc::get(&context))); - - mlir::func::FuncOp function = - EmitConvolutionForwardAsMlir(conv, "Conv", &context).value(); - - mlir_module->push_back(function); - (void)mlir_module->verifyInvariants(); - - std::string mlir_text = llvm_ir::DumpToString(function); - VLOG(1) << mlir_text; - - { - mlir::PassManager pm(mlir_module->getContext()); - pm.addPass(mlir::createLowerAffinePass()); - pm.addPass(mlir::createConvertSCFToCFPass()); - pm.addPass(mlir::createFinalizeMemRefToLLVMConversionPass()); - pm.addPass(mlir::createConvertFuncToLLVMPass()); - CHECK(mlir::succeeded(pm.run(*mlir_module))); - } - - return mlir_text; -} - -// TODO(timshen): integrate this with mlir's testing infrastructure. 
-TEST(ConvEmitterTest, TestDefault) { - std::string hlo_text = R"(HloModule TestModule -ENTRY %TestComputation { - %param_0 = f16[128,4,224,224]{1,3,2,0} parameter(0) - %param_1 = f16[7,7,64,4]{3,1,0,2} parameter(1) - ROOT %custom-call.1 = (f16[128,64,112,112]{1,3,2,0}, u8[0]{0}) custom-call(%param_0, %param_1), window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=bf01_01oi->bf01, custom_call_target="__cudnn$convForward", backend_config="{conv_result_scale:1}" -})"; - - std::string expected_mlir_pattern = - R"( -CHECK: func @Conv(%arg0: memref<128x112x112x64xf16>, %arg1: memref<128x224x224x4xf16>, %arg2: memref<64x7x7x4xf16>) { -CHECK-NEXT: affine.for %arg3 = 0 to 128 { -CHECK-NEXT: affine.for %arg4 = 0 to 2 { -CHECK-NEXT: affine.for %arg5 = 0 to 112 { -CHECK-NEXT: affine.for %arg6 = 0 to 7 { -CHECK-NEXT: %[[ALLOC:.*]] = memref.alloc() : memref<32x16xf32> -CHECK-NEXT: affine.for %arg7 = 0 to 32 { -CHECK-NEXT: affine.for %arg8 = 0 to 16 { -CHECK-NEXT: %cst = arith.constant 0.000000e+00 : f32 -CHECK-NEXT: affine.store %cst, %[[ALLOC]][%arg7, %arg8] : memref<32x16xf32> -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: affine.for %arg7 = 0 to 1 { -CHECK-NEXT: affine.for %arg8 = 0 to 7 { -CHECK-NEXT: affine.for %arg9 = 0 to 7 { -CHECK-NEXT: affine.for %arg10 = 0 to 32 { -CHECK-NEXT: affine.for %arg11 = 0 to 16 { -CHECK-NEXT: affine.for %arg12 = 0 to 4 { -CHECK-NEXT: %[[LOAD0:.*]] = affine.load %arg1[%arg3, %arg5 * 2 + %arg8 - 3, (%arg6 * 16 + %arg11) * 2 + %arg9 - 3, %arg7 * 4 + %arg12] : memref<128x224x224x4xf16> -CHECK-NEXT: %[[EXT0:.*]] = arith.extf %[[LOAD0]] : f16 to f32 -CHECK-NEXT: %[[LOAD1:.*]] = affine.load %arg2[%arg4 * 32 + %arg10, %arg8, %arg9, %arg7 * 4 + %arg12] : memref<64x7x7x4xf16> -CHECK-NEXT: %[[EXT1:.*]] = arith.extf %[[LOAD1]] : f16 to f32 -CHECK-NEXT: %[[LOAD2:.*]] = affine.load %[[ALLOC]][%arg10, %arg11] : memref<32x16xf32> -CHECK-NEXT: %[[MUL:.*]] = arith.mulf %[[EXT0]], %[[EXT1]] : f32 -CHECK-NEXT: %[[ADD:.*]] = arith.addf %[[LOAD2]], %[[MUL]] : f32 -CHECK-NEXT: affine.store %[[ADD]], %[[ALLOC]][%arg10, %arg11] : memref<32x16xf32> -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: affine.for %arg7 = 0 to 32 { -CHECK-NEXT: affine.for %arg8 = 0 to 16 { -CHECK-NEXT: %[[LOAD:.*]] = affine.load %[[ALLOC]][%arg7, %arg8] : memref<32x16xf32> -CHECK-NEXT: %[[TRUNC:.*]] = arith.truncf %[[LOAD]] : f32 to f16 -CHECK-NEXT: affine.store %[[TRUNC]], %arg0[%arg3, %arg5, %arg6 * 16 + %arg8, %arg4 * 32 + %arg7] : memref<128x112x112x64xf16> -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: } -CHECK-NEXT: return -CHECK-NEXT: } -)"; - - EXPECT_TRUE( - RunFileCheck(CompileHloConvAndGetMlir(hlo_text), expected_mlir_pattern) - .value()); -} - -} // namespace -} // namespace experimental -} // namespace xla diff --git a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_transforms.cc b/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_transforms.cc deleted file mode 100644 index 91268062959..00000000000 --- a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_transforms.cc +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_transforms.h" - -#include - -#include "absl/algorithm/container.h" -#include "llvm/ADT/StringRef.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project -#include "mlir/Dialect/Affine/LoopUtils.h" // from @llvm-project -#include "tensorflow/tsl/platform/logging.h" - -namespace xla { -namespace experimental { - -using mlir::OpBuilder; - -BoundAffineMap GetBoundAffineMapFrom(mlir::Operation* op) { - if (auto load = mlir::dyn_cast(op)) { - return {load.getAffineMap(), - std::vector(load.getMapOperands().begin(), - load.getMapOperands().end())}; - } else if (auto store = mlir::dyn_cast(op)) { - return {store.getAffineMap(), - std::vector(store.getMapOperands().begin(), - store.getMapOperands().end())}; - } else { - CHECK(false); - } -} - -mlir::Operation* CloneWithNewAffineMap(mlir::Operation* op, - BoundAffineMap new_affine, - OpBuilder builder) { - if (auto load = mlir::dyn_cast(op)) { - return builder.create( - builder.getUnknownLoc(), load.getMemRef(), new_affine.affine_map, - new_affine.operands); - } else if (auto store = mlir::dyn_cast(op)) { - return builder.create( - builder.getUnknownLoc(), store.getValueToStore(), store.getMemRef(), - new_affine.affine_map, new_affine.operands); - } else { - CHECK(false); - } -} - -bool IsSimpleLoop(mlir::AffineForOp loop) { - return loop.getLowerBoundMap().isSingleConstant() && - loop.getLowerBoundMap().getSingleConstantResult() == 0 && - loop.getStep() == 1 && loop.getUpperBoundMap().getNumResults() == 1 && - std::next(loop.getRegion().begin()) == loop.getRegion().end(); -} - -std::vector CreateNestedSimpleLoops( - absl::Span upper_bounds, OpBuilder builder) { - std::vector loops; - loops.reserve(upper_bounds.size()); - for (int64_t dim : upper_bounds) { - auto loop = - builder.create(builder.getUnknownLoc(), 0, dim); - loops.push_back(loop); - builder = OpBuilder::atBlockTerminator(loop.getBody()); - } - return loops; -} - -void SetBoundForSimpleLoop(mlir::AffineForOp loop, mlir::AffineExpr new_bound, - OpBuilder builder) { - CHECK(IsSimpleLoop(loop)); - - loop.setUpperBoundMap(mlir::AffineMap::get( - loop.getUpperBoundMap().getNumDims(), - loop.getUpperBoundMap().getNumSymbols(), {new_bound})); -} - -mlir::AffineForOp TileLoop(mlir::AffineForOp loop, int64_t size, - mlir::AffineForOp target) { - CHECK(IsSimpleLoop(loop)); - CHECK(IsSimpleLoop(target)); - { - llvm::SmallVector all_loops; - getPerfectlyNestedLoops(all_loops, loop); - CHECK(absl::c_linear_search(all_loops, target)); - } - - auto builder = OpBuilder::atBlockTerminator(target.getBody()); - - auto inner_loop = - builder.create(builder.getUnknownLoc(), 0, size); - { - auto& inner_operations = inner_loop.getBody()->getOperations(); - auto& target_operations = target.getBody()->getOperations(); - - inner_operations.splice(inner_operations.begin(), target_operations, - target_operations.begin(), - std::prev(target_operations.end(), 2)); - - mlir::AffineExpr length = loop.getUpperBoundMap().getResult(0); - CHECK_EQ(0, 
length.cast().getValue() % size); - SetBoundForSimpleLoop(loop, length.ceilDiv(size), builder); - } - - for (auto& use : - llvm::make_early_inc_range(loop.getInductionVar().getUses())) { - mlir::Operation* owner = use.getOwner(); - BoundAffineMap affine_map = GetBoundAffineMapFrom(owner); - unsigned new_dim = affine_map.operands.size(); - affine_map.operands.push_back(inner_loop.getInductionVar()); - std::vector replacements; - for (int i = 0; i < affine_map.affine_map.getNumDims(); i++) { - if (affine_map.operands[i] == loop.getInductionVar()) { - replacements.push_back(builder.getAffineDimExpr(i) * size + - builder.getAffineDimExpr(new_dim)); - } else { - replacements.push_back(builder.getAffineDimExpr(i)); - } - } - affine_map.affine_map = affine_map.affine_map.replaceDimsAndSymbols( - replacements, {}, affine_map.operands.size(), 0); - auto new_op = CloneWithNewAffineMap(owner, affine_map, OpBuilder(owner)); - owner->replaceAllUsesWith(new_op); - owner->erase(); - } - return inner_loop; -} - -void SinkPerfectlyNestedLoops(llvm::MutableArrayRef loops, - int rotate_amount) { - CHECK_GE(rotate_amount, 0); - std::vector permutation(loops.size()); - std::iota(permutation.begin(), permutation.end(), unsigned(0)); - std::rotate(permutation.begin(), - permutation.begin() + loops.size() - rotate_amount, - permutation.end()); - mlir::permuteLoops(loops, permutation); -} - -} // namespace experimental -} // namespace xla diff --git a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_transforms.h b/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_transforms.h deleted file mode 100644 index 97c44daa52f..00000000000 --- a/tensorflow/compiler/xla/experimental/conv_emitter/conv_emitter_transforms.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_EXPERIMENTAL_CONV_EMITTER_CONV_EMITTER_TRANSFORMS_H_ -#define TENSORFLOW_COMPILER_XLA_EXPERIMENTAL_CONV_EMITTER_CONV_EMITTER_TRANSFORMS_H_ - -#include "absl/types/span.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "tensorflow/tsl/platform/types.h" - -namespace xla { -namespace experimental { - -struct BoundAffineMap { - mlir::AffineMap affine_map; - std::vector operands; -}; - -BoundAffineMap GetBoundAffineMapFrom(mlir::Operation* op); -mlir::Operation* CloneWithNewAffineMap(mlir::Operation* op, - BoundAffineMap new_affine, - mlir::OpBuilder builder); - -bool IsSimpleLoop(mlir::AffineForOp loop); -std::vector CreateNestedSimpleLoops( - absl::Span upper_bounds, mlir::OpBuilder builder); -void SetBoundForSimpleLoop(mlir::AffineForOp loop, mlir::AffineExpr new_bound, - mlir::OpBuilder builder); - -// Tile a loop with trip count N by `size`. 
For now, N has to be a multiple of -// size, but later this constraint will be removed. -// -// The major loop (with trip count N / size) stays as-is, while the minor loop -// (with trip count `size`) will take over the body of `target`, and be placed -// as the new body of `target`. -// -// `target` has to be within the same "perfectly nested loop group" as `loop`. -// See the documentation for mlir::getPerfectlyNestedLoops. -// -// Example: -// Before tiling `loop` with tile size X: -// for (loop in N) -// for (unrelated_loop in ...) -// for (target in ...) -// // pass loop into affine maps -// After: -// for (loop in N / X) -// for (unrelated_loop in ...) -// for (target in ...) -// for (tiled_loop in X) -// // rewrite all affine exprs from loop to `loop * X + tiled_loop`. -// -// Design note: -// TileLoop is different from mlir::tile. At the moment, mlir::tile is not well -// documented about the exact tiling semantics, but the observed behavior is: -// for (i from 0 to N) -// for (unrelated_loop in ...) -// for (target in ...) -// // pass i into affine maps -// => -// for (i from 0 to N, step = X) -// for (unrelated_loop in ...) -// for (target in ...) -// for (j from i to min(i + X, N), step = 1) -// // pass j into affine maps -// -// There are two differences between mlir::tile and TileLoop: -// * TileLoop always puts the tiling logic "stepping" logic into AffineExprs. -// With that all index calculation is done in AffineExprs and easier to -// analyze in a single place. -// * TileLoop doesn't plan to use max() and min() to resolve the issue when -// N % X != 0. max() and min() are not representable in AffineExprs. -// TODO(timshen): support the case where N % X != 0. -// -// TODO(timshen): consider the possibility to reuse mlir::tile's logic to -// achieve the same goal. -mlir::AffineForOp TileLoop(mlir::AffineForOp loop, int64_t size, - mlir::AffineForOp target); - -// Sinks a segment of perfectly nested loops to the bottom. It implements this -// by rotating the loop nest by rotate_amount. -void SinkPerfectlyNestedLoops(llvm::MutableArrayRef loops, - int rotate_amount); - -} // namespace experimental -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_EXPERIMENTAL_CONV_EMITTER_CONV_EMITTER_TRANSFORMS_H_ diff --git a/tensorflow/compiler/xla/experimental/conv_emitter/g3doc/conv_emitter.md b/tensorflow/compiler/xla/experimental/conv_emitter/g3doc/conv_emitter.md deleted file mode 100644 index 6151357372d..00000000000 --- a/tensorflow/compiler/xla/experimental/conv_emitter/g3doc/conv_emitter.md +++ /dev/null @@ -1,324 +0,0 @@ -# Convolution Emitter - -## Context - -This is a doc that describes a set of patches that are still under review. -TODO(timshen): Change once all patches are checked in. - -The convolution emitter is a prototype with the following goals: - -* The top priority is performance. -* It supports arbitrarily sophisticated layouts. -* It supports platform-specific high-performance instructions. -* It is as portable as possible. -* It enables fusion support in the future. - -## Current Design - -### Overview - -The prototype consists of the following components: - -* The emitter currently focuses on NVIDIA Volta architecture and N(C/4)HW4 - layout. -* An MLIR-based emitter. It takes a set of tuning parameters and a convolution - configuration, then produces a NVVM device function. -* An autotuner, which generates tuning parameters given a convolution - configuration. 
-* A test framework, which executes the generated device function with random
-  inputs and compares the result against cuDNN.
-
-### The Emitter - Naive Implementation
-
-The emitter starts with a hand-built, naive implementation that looks like the
-following ResNet first-layer convolution (pseudocode):
-
-```mlir
-func @Conv(%input : memref<128x1x224x224xvector<4xf16>>,
-           %filter : memref<64x1x7x7xvector<4xf16>>,
-           %output : memref<128x64x224x224xf16>) {
-  affine.parallel (%n, %o, %oh, %ow) = 0 to 128, 0 to 64, 0 to 112, 0 to 112 {
-    %acc = alloc() : memref<f32>
-    affine.store 0, %acc[]
-    affine.for (%c, %fh, %fw) = 0 to 1, 0 to 7, 0 to 7 {
-      %a = affine.padded.load %input[%n, %c, %oh * 2 + %fh - 3, %ow * 2 + %fw - 3]
-      %b = affine.load %filter[%o, %c, %fh, %fw]
-      %old = affine.load %acc[]
-      %d = std.fpext %a to vector<4xf32>
-      %e = std.fpext %b to vector<4xf32>
-      %f = std.multiply %d, %e
-      %g = "reduce" %f
-      %new = %g + %old
-      affine.store %new, %acc[]
-    }
-    %v = affine.load %acc[]
-    affine.store %v, %output[%n, %o, %oh, %ow]
-  }
-}
-```
-
-A few extensions are used in the example above:
-
-* affine.padded.load allows out-of-bounds access, in which case the result is
-  always 0.
-* The "reduce" operation produces the sum of elements in a vector.
-
-Also notice that the input element type is vector<4xf16> only because the
-current implementation does so. A MemRef with <...x4xf16> should work as well,
-provided the buffer is aligned to at least 8 bytes (usually 16).
-
-The emitter then applies a few semantics-preserving transformations to move the
-code towards PTX's structure.
-
-### The Emitter - Tiling
-
-The following is the naive code after loop tiling:
-
-```mlir
-func @Conv(%input : memref<128x1x224x224xvector<4xf16>>,
-           %filter : memref<64x1x7x7xvector<4xf16>>,
-           %output : memref<128x64x224x224xf16>) {
-  affine.parallel (%n0, %o0, %oh0, %ow0) = 0 to 128, 0 to 1, 0 to 7, 0 to 7 {
-    affine.parallel (%n1, %o1, %oh1, %ow1) = 0 to 1, 0 to 64, 0 to 16, 0 to 16 {
-      %acc = alloc() : memref<f32>
-      affine.store 0, %acc[]
-      affine.for (%c0, %fh0, %fw0) = 0 to 1, 0 to 1, 0 to 1 {
-        affine.for (%c1, %fh1, %fw1) = 0 to 1, 0 to 7, 0 to 7 {
-          %a = affine.padded.load %input[
-              %n0 * 1 + %n1,
-              %c0 * 1 + %c1,
-              (%oh0 * 16 + %oh1) * 2 + %fh0 * 7 + %fh1 - 3,
-              (%ow0 * 16 + %ow1) * 2 + %fw0 * 7 + %fw1 - 3]
-          %b = affine.load %filter[
-              %o0 * 64 + %o1,
-              %c0 * 1 + %c1,
-              %fh0 * 7 + %fh1,
-              %fw0 * 7 + %fw1]
-          %old = affine.load %acc[]
-          %d = std.fpext %a to vector<4xf32>
-          %e = std.fpext %b to vector<4xf32>
-          %f = std.multiply %d, %e
-          %g = "reduce" %f
-          %new = %g + %old
-          affine.store %new, %acc[]
-        }
-      }
-      %v = affine.load %acc[]
-      affine.store %v, %output[
-          %n0 * 1 + %n1,
-          %o0 * 64 + %o1,
-          %oh0 * 16 + %oh1,
-          %ow0 * 16 + %ow1]
-    } { ptx_block }
-  } { ptx_grid }
-}
-```
-
-The motivation is straightforward: we need to decide which loops are
-parallelized on the compute units of the PTX architecture. The `ptx_grid` and
-`ptx_block` attributes direct that the loop should be parallelized across a
-grid or a block, respectively.
-
-Also notice that, to keep the code pattern clean, tiling is implemented in the
-following way. Defining a "simple loop" as a loop with lower bound 0 and step
-1, the tiling:
-
-* only takes simple loops.
-* only produces simple loops.
-* generates no extra operations. All altered index calculations are done in
-  each user's AffineMaps.
-
-The contracting dimensions (%c, %fh, %fw) are also tiled once; the significance
-will become clear in the shared memory promotion step.
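The tiling contract described above can be checked with ordinary integer arithmetic. A minimal C++ sketch (illustrative only, not part of the emitter) showing that, when the trip count N is a multiple of the tile size X, the rewritten index expression `outer * X + inner` enumerates exactly the indices of the original simple loop:

```cpp
// Illustration only: the index arithmetic that TileLoop folds into the affine
// maps. With N a multiple of the tile size X, the tiled nest (major loop with
// trip count N / X, minor loop with trip count X) visits exactly the same
// indices as the original simple loop, in the same order.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> UntiledIndices(int64_t n) {
  std::vector<int64_t> out;
  for (int64_t i = 0; i < n; ++i) {
    out.push_back(i);
  }
  return out;
}

std::vector<int64_t> TiledIndices(int64_t n, int64_t x) {
  assert(n % x == 0);  // TileLoop currently requires N % X == 0.
  std::vector<int64_t> out;
  for (int64_t outer = 0; outer < n / x; ++outer) {    // major loop, N / X
    for (int64_t inner = 0; inner < x; ++inner) {      // minor loop, X
      out.push_back(outer * x + inner);                // the rewritten expr
    }
  }
  return out;
}

int main() {
  // e.g. the 224-trip spatial loop tiled by 16, as in the example above.
  assert(UntiledIndices(224) == TiledIndices(224, 16));
  return 0;
}
```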
- -### The Emitter - Splitting - -This step splits the body of the (%n1, %o1, %oh1, %ow1) loop into several parts: - -* The code that sets the accumulators to 0. -* The actual convolution computation code. -* The code that writes back accumulators to the %output buffer. - -This transformation "vectorizes" the accumulator accordingly as the `alloc()` -gets hoisted out of the `affine.parallel` op. - -After splitting: - -```mlir -func @Conv(%input : memref<128x1x224x224xvector<4xf16>>, - %filter : memref<64x1x7x7xvector<4xf16>>, - %output : memref<128x64x224x224xf16>) { - affine.parallel (%n0, %o0, %oh0, %ow0) = 0 to 128, 0 to 1, 0 to 7, 0 to 7 { - %acc = alloc() : memref<1x64x16x16xf32> - affine.parallel (%n1, %o1, %oh1, %ow1) = 0 to 1, 0 to 64, 0 to 16, 0 to 16 { - affine.store 0, %acc[%n1, %o1, %oh1, %ow1] - } { ptx_block } - affine.parallel (%n1, %o1, %oh1, %ow1) = 0 to 1, 0 to 64, 0 to 16, 0 to 16 { - affine.for (%c0, %fh0, %fw0) = 0 to 1, 0 to 1, 0 to 1 { - affine.for (%c1, %fh1, %fw1) = 0 to 1, 0 to 7, 0 to 7 { - %a = affine.padded.load %input[ - %n0 * 1 + %n1, - %c0 * 1 + %c1, - (%oh0 * 16 + %oh1) * 2 + %fh0 * 7 + %fh1 - 3, - (%ow0 * 16 + %ow1) * 2 + %fw0 * 7 + %fw1 - 3] - %b = affine.load %filter[ - %o0 * 64 + %o1, - %c0 * 1 + %c1, - %fh0 * 7 + %fh1, - %fw0 * 7 + %fw1] - %old = affine.load %acc[%n1, %o1, %oh1, %ow1] - %d = std.fpext %a to vector<4xf32> - %e = std.fpext %b to vector<4xf32> - %f = std.multiply %d, %e - %g = "reduce" %f - %new = %g + %old - affine.store %new, %acc[%n1, %o1, %oh1, %ow1] - } - } - } { ptx_block } - affine.parallel (%n1, %o1, %oh1, %ow1) = 0 to 1, 0 to 64, 0 to 16, 0 to 16 { - %v = affine.load %acc[%n1, %o1, %oh1, %ow1] - affine.store %v, %output[ - %n0 * 1 + %n1, - %o0 * 64 + %o1, - %oh0 * 16 + %oh1, - %ow0 * 16 + %ow1] - } { ptx_block } - } { ptx_grid } -} -``` - -To prepare for the next transformations, we'd also like to sink the (%n1, %o1, -%oh1, %ow1), as (%c0, %fh0, %fw0) is not interesting. - -``` -affine.parallel (%n1, %o1, %oh1, %ow1) = 0 to 1, 0 to 64, 0 to 16, 0 to 16 { - affine.for (%c0, %fh0, %fw0) = 0 to 1, 0 to 1, 0 to 1 { - affine.for (%c1, %fh1, %fw1) = 0 to 1, 0 to 7, 0 to 7 { - ... - } - } -} { ptx_block } - -=> - -affine.for (%c0, %fh0, %fw0) = 0 to 1, 0 to 1, 0 to 1 { - affine.for (%c1, %fh1, %fw1) = 0 to 1, 0 to 7, 0 to 7 { - affine.parallel (%n1, %o1, %oh1, %ow1) = 0 to 1, 0 to 64, 0 to 16, 0 to 16 { - ... - } { ptx_block } - } -} -``` - -### The Emitter - Shared Memory Promotion - -This transformation is done by `affineDataCopyGenerate`, which does precise -calculation on how much memory is transferred for a load operation. - -After calculating the sizes of the shared memory buffer (`%promoted_input` and -`%promoted_filter`), the transformation also creates loads and stores to -pre-fetch data from global memory (`%input`, `%filter`) to the promoted, shared -memory. 
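The buffer sizes that `affineDataCopyGenerate` derives can be sanity-checked by hand. A small C++ sketch (illustrative only; the helper name is made up) of the per-dimension input footprint touched by one output tile of a strided convolution; with a 16-wide output tile, stride 2, and a 7-wide filter it yields 37, matching the `memref<1x1x37x37>` promoted input buffer in the "After" snippet below:

```cpp
// Illustration only: input elements touched along one spatial dimension by an
// output tile of `tile` elements, with convolution stride `stride` and filter
// extent `filter`. The promoted shared-memory buffer must cover this span.
#include <cstdio>

int InputFootprint(int tile, int stride, int filter) {
  return (tile - 1) * stride + filter;
}

int main() {
  // 16-wide output tile, stride 2, 7-wide filter -> (16 - 1) * 2 + 7 = 37.
  std::printf("%d\n", InputFootprint(/*tile=*/16, /*stride=*/2, /*filter=*/7));
  return 0;
}
```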
- -```mlir -// Before -affine.for (%c1, %fh1, %fw1) = 0 to 1, 0 to 7, 0 to 7 { - affine.parallel (%n1, %o1, %oh1, %ow1) = 0 to 1, 0 to 64, 0 to 16, 0 to 16 { - %a = affine.padded.load %input[ - %n0 * 1 + %n1, - %c0 * 1 + %c1, - (%oh0 * 16 + %oh1) * 2 + %fh0 * 7 + %fh1 - 3, - (%ow0 * 16 + %ow1) * 2 + %fw0 * 7 + %fw1 - 3] - %b = affine.load %filter[ - %o0 * 64 + %o1, - %c0 * 1 + %c1, - %fh0 * 7 + %fh1, - %fw0 * 7 + %fw1] - %old = affine.load %acc[%n1, %o1, %oh1, %ow1] - %d = std.fpext %a to vector<4xf32> - %e = std.fpext %b to vector<4xf32> - %f = std.multiply %d, %e - %g = "reduce" %f - %new = %g + %old - affine.store %new, %acc[%n1, %o1, %oh1, %ow1] - } { ptx_block } -} -``` - -```mlir -// After - -%promoted_input = alloc() : memref<1x1x37x37, memory_space = 3> -%promoted_filter = alloc() : memref<64x1x7x7, memory_space = 3> -affine.parallel (%i0, %i1, %i2, %i3) = 0 to 1, 0 to 1, 0 to 37, 0 to 37 { - %v = affine.padded.load %input[ - %n0 * 1 + %i0, - %c0 * 1 + %i1, - (%oh0 * 16) * 2 + %fh0 * 7 + %i2 - 3, - (%ow0 * 16) * 2 + %fw0 * 7 + %i3 - 3] - affine.store %v, %promoted_input[%i0, %i1, %i2, %i3] -} { ptx_block } -affine.parallel (%i0, %i1, %i2, %i3) = 0 to 64, 0 to 1, 0 to 7, 0 to 7 { - %v = affine.load %filter[ - %o0 * 64 + %i0, - %c0 * 1 + %i1, - %fh0 * 7 + %i2, - %fw0 * 7 + %i3] - affine.store %v, %promoted_filter[%i0, %i1, %i2, %i3] -} { ptx_block } -affine.for (%c1, %fh1, %fw1) = 0 to 1, 0 to 7, 0 to 7 { - affine.parallel (%n1, %o1, %oh1, %ow1) = 0 to 1, 0 to 64, 0 to 16, 0 to 16 { - %a = affine.load %promoted_input[%n1, %c1, %oh1 * 2 + %fh1, %ow1 * 2 + %fw1] - %b = affine.load %promoted_filter[%o1, %c1, %fh1, %fw1] - %old = affine.load %acc[%n1, %o1, %oh1, %ow1] - %d = std.fpext %a to vector<4xf32> - %e = std.fpext %b to vector<4xf32> - %f = std.multiply %d, %e - %g = "reduce" %f - %new = %g + %old - affine.store %new, %acc[%n1, %o1, %oh1, %ow1] - } { ptx_block } -} -``` - -### The Emitter - Volta MMA Instruction - -This transformation turns the inner loop: - -```mlir -affine.parallel (%n1, %o1, %oh1, %ow1) = 0 to 1, 0 to 64, 0 to 16, 0 to 16 { - %a = affine.load %promoted_input[%n1, %c1, %oh1 * 2 + %fh1, %ow1 * 2 + %fw1] - %b = affine.load %promoted_filter[%o1, %c1, %fh1, %fw1] - %old = affine.load %acc[%n1, %o1, %oh1, %ow1] - %d = std.fpext %a to vector<4xf32> - %e = std.fpext %b to vector<4xf32> - %f = std.multiply %d, %e - %g = "reduce" %f - %new = %g + %old - affine.store %new, %acc[%n1, %o1, %oh1, %ow1] -} { ptx_block } -``` - -to multiple Volta mma.sync instructions. The result is not shown here, because -the prototype currently only hacks it up to achieve benchmark goals. - -### The Autotuner - -As shown above, many parameters dictate how a naive implementation is -transformed. For now, the parameters are all tile sizes. On the top of the -emitter, the prototype includes a simple autotuner that enumerates all good -combinations of tile sizes and invoke the emitter with each of the combinations. -With the assistance of in-process benchmarking, the autotuner is able to pick -the best set of parameters. - -## Future Improvements - -* Explore Linalg/Vector for a higher-level naive implementation. MMA - instruction handling would be much easier with high-level functional - constructs. -* Explore other layouts. The current layout corresponds to NVIDIA - `CUDNN_TENSOR_NCHW_VECT_C` but for fp16s. -* Iron out GPU dialect related lowering. Annotations like `ptx_grid` and - `ptx_block` should be generalized to more architectures. -* Speed up autotuning through more pruning. 
-* Support dynamic shapes. diff --git a/tensorflow/compiler/xla/experiments/BUILD b/tensorflow/compiler/xla/experiments/BUILD new file mode 100644 index 00000000000..d298feaf3f0 --- /dev/null +++ b/tensorflow/compiler/xla/experiments/BUILD @@ -0,0 +1,8 @@ +# Various experiments related to the compiler that are not a part of the final XLA binary. + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + # keep visibility private, if you need to depend on this, move it out of experiments + default_visibility = ["//visibility:private"], + licenses = ["notice"], +) diff --git a/tensorflow/compiler/xla/experiments/README.md b/tensorflow/compiler/xla/experiments/README.md new file mode 100644 index 00000000000..502dcb82913 --- /dev/null +++ b/tensorflow/compiler/xla/experiments/README.md @@ -0,0 +1,24 @@ +# XLA Experiments + +This folder is intended to serve as a place to collaborate on code related to +the XLA compiler, but will not end up being a part of the compiler itself. + +As such, the code here is not necessarily production quality, and should not be +depended on from other parts of the compiler. + +Some examples of code appropriate for this folder are: + +* microbenchmarks that allow us to better understand various architectures +* scripts that help with developing specific features of the compiler, which + might remain useful after the feature is complete (general tools should + instead go into the xla/tools directory) +* experimental code transformations that are not yet integrated into the + compiler + +## Visibility + +As a result of the nature of the content in this folder, its build visibility +is intentionally kept private. + +If you need something from here elsewhere, the recommended approach is to move +it to a more suitable and production-supported location. \ No newline at end of file diff --git a/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/BUILD b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/BUILD new file mode 100644 index 00000000000..769e8a76bb0 --- /dev/null +++ b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/BUILD @@ -0,0 +1,32 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda") + +cc_library( + name = "sm_bw_utils", + hdrs = ["sm_bw_utils.h"], + defines = if_cuda(["GOOGLE_CUDA=1"]), + deps = [ + "//tensorflow/tsl/platform:logging", + ] + if_cuda([ + "@local_config_cuda//cuda:cuda_headers", + ]), +) + +cuda_library( + name = "sm_bw_kernels", + srcs = ["sm_bw_kernels.cu.cc"], + hdrs = ["sm_bw_kernels.h"], + deps = [ + ":sm_bw_utils", + ], +) + +cc_test( + name = "sm_bw_test", + srcs = ["sm_bw_test.cc"], + tags = ["requires-gpu-sm80-only"], + deps = [ + ":sm_bw_kernels", + ":sm_bw_utils", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.cu.cc b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.cu.cc new file mode 100644 index 00000000000..d0bc62cd0de --- /dev/null +++ b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.cu.cc @@ -0,0 +1,131 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA + +#include "tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.h" + +#include "tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_utils.h" + +namespace experiments { +namespace benchmark { +#define DFUNC __forceinline__ __device__ +#define HDFUNC DFUNC __host__ + +constexpr int kMaxBlockSize = 1024; + +template +class Vec { + public: + using ElementType = ET; + constexpr static size_t Size = S; + + template + HDFUNC Vec(Ts... elements) : data_() { + InsertElements(0, elements...); + } + + HDFUNC ElementType& operator[](size_t idx) { return data_[idx]; } + HDFUNC const ElementType& operator[](size_t idx) const { return data_[idx]; } + + private: + template + HDFUNC void InsertElements(size_t idx, T element, Ts... rest) { + data_[idx] = element; + InsertElements(idx + 1, rest...); + } + HDFUNC void InsertElements(size_t idx) {} + + ElementType data_[Size]; +}; + +template +DFUNC void Store(VectorType vx, T* __restrict__ x, size_t id) { + reinterpret_cast(x)[id] = vx; +} +template <> +DFUNC void Store(Vec vx, float* __restrict__ x, size_t id) { + asm("st.global.v4.f32 [%0], {%1, %2, %3, %4};" + : + : "l"(x + 4 * id), "f"(vx[0]), "f"(vx[1]), "f"(vx[2]), "f"(vx[3])); +} + +template +DFUNC void LoadNc(VectorType& vx, const T* __restrict__ x, size_t id) { + vx = reinterpret_cast(x)[id]; +} + +template <> +DFUNC void LoadNc(Vec& vx, const float* __restrict__ x, size_t id) { + asm("ld.global.nc.v4.f32 {%0, %1, %2, %3}, [%4];" + : "=f"(vx[0]), "=f"(vx[1]), "=f"(vx[2]), "=f"(vx[3]) + : "l"(x + 4 * id)); +} + +template +__launch_bounds__(kMaxBlockSize) __global__ + void BenchmarkDeviceCopyKernel(const float* __restrict__ in, + float* __restrict__ out, int64_t size) { + const int64_t lines = size / (blockDim.x * chunks); + const int64_t start_line = lines * blockIdx.x / gridDim.x; + const int64_t end_line = lines * (blockIdx.x + 1) / gridDim.x; + const int64_t start_offset = + start_line * blockDim.x * chunks + 4 * threadIdx.x; + const int64_t end_offset = end_line * blockDim.x * chunks; + Vec buffer[chunks / 4]; + for (int64_t i = start_offset; i < end_offset; i += blockDim.x * chunks) { +#pragma unroll + for (int j = 0; j < chunks; j += 4) { + LoadNc(buffer[j / 4], in + i + blockDim.x * j, 0); + } +#pragma unroll + for (int j = 0; j < chunks; j += 4) { + Store(buffer[j / 4], out + i + blockDim.x * j, 0); + } + } +} + +template +void BenchmarkDeviceCopy(float* in, float* out, int64_t size, int num_blocks, + int num_threads) { + BenchmarkDeviceCopyKernel<<>>(in, out, size); + CHECK_CUDA(cudaGetLastError()); +} + +template void BenchmarkDeviceCopy<1>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template void BenchmarkDeviceCopy<1 << 1>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template void BenchmarkDeviceCopy<1 << 2>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template void BenchmarkDeviceCopy<1 << 3>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template 
void BenchmarkDeviceCopy<1 << 4>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template void BenchmarkDeviceCopy<1 << 5>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template void BenchmarkDeviceCopy<1 << 6>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template void BenchmarkDeviceCopy<1 << 7>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template void BenchmarkDeviceCopy<1 << 8>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template void BenchmarkDeviceCopy<1 << 9>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +template void BenchmarkDeviceCopy<1 << 10>(float* in, float* out, int64_t size, + int num_blocks, int num_threads); +} // namespace benchmark +} // namespace experiments + +#endif // GOOGLE_CUDA diff --git a/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.h b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.h new file mode 100644 index 00000000000..ea398f04a06 --- /dev/null +++ b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.h @@ -0,0 +1,29 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_EXPERIMENTS_SM_BANDWIDTH_BENCHMARK_SM_BW_KERNELS_H_ +#define TENSORFLOW_COMPILER_XLA_EXPERIMENTS_SM_BANDWIDTH_BENCHMARK_SM_BW_KERNELS_H_ + +namespace experiments { +namespace benchmark { + +template +void BenchmarkDeviceCopy(float* in, float* out, int64_t size, int num_blocks, + int num_threads); + +} // namespace benchmark +} // namespace experiments + +#endif // TENSORFLOW_COMPILER_XLA_EXPERIMENTS_SM_BANDWIDTH_BENCHMARK_SM_BW_KERNELS_H_ diff --git a/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_test.cc b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_test.cc new file mode 100644 index 00000000000..e4bacd79c3f --- /dev/null +++ b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_test.cc @@ -0,0 +1,315 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA + +#include + +#include +#include "tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.h" +#include "tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_utils.h" + +namespace experiments { +namespace benchmark { +namespace { + +constexpr int kNumSM = 108; +constexpr int kNum32BitRegisters = 64 * 1024; +constexpr int kMaxBlockSize = 1024; + +template +struct DeviceMemoryDeleter { + void operator()(T* ptr) { cudaFree(ptr); } +}; +template +using DeviceMemory = std::unique_ptr>; + +template +DeviceMemory MakeDeviceMemory(size_t size) { + T* gpu_ptr = nullptr; + CHECK_CUDA(cudaMalloc(reinterpret_cast(&gpu_ptr), size * sizeof(T))); + return DeviceMemory(gpu_ptr); +} + +template +struct HostMemoryDeleter { + void operator()(T* ptr) { free(ptr); } +}; +template +using HostMemory = std::unique_ptr>; + +template +HostMemory MakeHostMemory(size_t size) { + T* h_in = (T*)malloc(size * sizeof(T)); + return HostMemory(h_in); +} + +struct EventDeleter { + using pointer = cudaEvent_t; + void operator()(pointer event) { cudaEventDestroy(event); } +}; +using Event = std::unique_ptr; +Event MakeEvent() { + cudaEvent_t event = nullptr; + CHECK_CUDA(cudaEventCreate(&event)); + return Event(event); +} + +bool CheckOutputAndClean(float* h_in, float* h_out, float* d_out, size_t size) { + cudaMemcpy(h_out, d_out, size * sizeof(float), cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < size; i++) { + if ((h_in[i] - h_out[i]) > 1e-6) { + LOG(ERROR) << "mismatch :(, i = " << i << " , values are " << h_in[i] + << ", " << h_out[i]; + return false; + } + h_out[i] = 0; + } + return true; +} + +template +float BenchmarkCustomDeviceCopy(int kReps, float* d_in, float* d_out, + size_t size, int num_blocks = kNumSM, + int num_threads = 64) { + Event start = MakeEvent(); + Event stop = MakeEvent(); + CHECK_CUDA(cudaEventRecord(start.get())); + for (int i = 0; i < kReps; i++) { + BenchmarkDeviceCopy(d_in, d_out, size, num_blocks, num_threads); + } + CHECK_CUDA(cudaEventRecord(stop.get())); + CHECK_CUDA(cudaEventSynchronize(stop.get())); + float time_diff = 0.0f; + CHECK_CUDA(cudaEventElapsedTime(&time_diff, start.get(), stop.get())); + return time_diff / kReps; +} + +float BenchmarkDev2DevCopy(int kReps, float* d_in, float* d_out, size_t size) { + Event start = MakeEvent(); + Event stop = MakeEvent(); + CHECK_CUDA(cudaEventRecord(start.get())); + for (int i = 0; i < kReps; i++) { + CHECK_CUDA(cudaMemcpy(d_out, d_in, size * sizeof(float), + cudaMemcpyDeviceToDevice)); + } + CHECK_CUDA(cudaEventRecord(stop.get())); + CHECK_CUDA(cudaEventSynchronize(stop.get())); + float time_diff = 0.0f; + CHECK_CUDA(cudaEventElapsedTime(&time_diff, start.get(), stop.get())); + return time_diff / kReps; +} + +// B/ms -> TB/s +float TbPerSec(size_t size, float time_diff) { + return 2 * sizeof(float) * size / (1e9 * time_diff); +} + +TEST(SMBandwidthTest, IncreasingMemorySize) { + constexpr int64_t kOneM = 1024 * 1024; + constexpr int64_t kOneG = 1024 * 1024 * 1024; + constexpr int64_t kMaxSize = kOneG; + + DeviceMemory d_in = MakeDeviceMemory(kMaxSize); + DeviceMemory d_out = MakeDeviceMemory(kMaxSize); + + HostMemory h_in = MakeHostMemory(kMaxSize); + HostMemory h_out = MakeHostMemory(kMaxSize); + + for (size_t i = 0; i < kMaxSize; i++) { + h_in.get()[i] = i; + } + CHECK_CUDA(cudaMemcpy(d_in.get(), h_in.get(), kMaxSize * sizeof(float), + cudaMemcpyHostToDevice)); + + constexpr int kReps = 10; + LOG(ERROR) << 
"size,custom TB/s,devTodev TB/s"; + for (size_t size = kOneM; size <= kMaxSize; size *= 2) { + float time_diff_c = + BenchmarkCustomDeviceCopy<1>(kReps, d_in.get(), d_out.get(), size); + EXPECT_TRUE( + CheckOutputAndClean(h_in.get(), h_out.get(), d_out.get(), size)); + + float time_diff_d2d = + BenchmarkDev2DevCopy(kReps, d_in.get(), d_out.get(), size); + EXPECT_TRUE( + CheckOutputAndClean(h_in.get(), h_out.get(), d_out.get(), size)); + + LOG(ERROR) << size << "," << TbPerSec(size, time_diff_c) << "," + << TbPerSec(size, time_diff_d2d); + } +} + +TEST(SMBandwidthTest, IncreasingNumBlocks) { + constexpr size_t kSize = 1 << 28; + constexpr int kReps = 10; + constexpr int kNumThreads = 64; + + DeviceMemory d_in = MakeDeviceMemory(kSize); + DeviceMemory d_out = MakeDeviceMemory(kSize); + + HostMemory h_in = MakeHostMemory(kSize); + HostMemory h_out = MakeHostMemory(kSize); + + for (size_t i = 0; i < kSize; i++) { + h_in.get()[i] = i; + } + CHECK_CUDA(cudaMemcpy(d_in.get(), h_in.get(), kSize * sizeof(float), + cudaMemcpyHostToDevice)); + + LOG(ERROR) << "num_blocks,TB/s"; + for (int64_t num_blocks = kNumSM; num_blocks <= kNumSM * 32; + num_blocks += kNumSM) { + Event start = MakeEvent(); + Event stop = MakeEvent(); + CHECK_CUDA(cudaEventRecord(start.get())); + for (int i = 0; i < kReps; i++) { + BenchmarkDeviceCopy<1>(d_in.get(), d_out.get(), kSize, num_blocks, + kNumThreads); + } + CHECK_CUDA(cudaEventRecord(stop.get())); + CHECK_CUDA(cudaEventSynchronize(stop.get())); + float time_diff = 0.0f; + CHECK_CUDA(cudaEventElapsedTime(&time_diff, start.get(), stop.get())); + time_diff /= kReps; + LOG(ERROR) << num_blocks << "," << TbPerSec(kSize, time_diff); + + CHECK_CUDA(cudaMemcpy(h_out.get(), d_out.get(), kSize * sizeof(float), + cudaMemcpyDeviceToHost)); + EXPECT_TRUE( + CheckOutputAndClean(h_in.get(), h_out.get(), d_out.get(), kSize)); + } +} + +template +struct ForLoop { + template